From d2499ead93dc4298c0882fe98902acb1b5209f99 Mon Sep 17 00:00:00 2001
From: trav90 <travawine@palemoon.org>
Date: Fri, 19 Oct 2018 23:05:00 -0500
Subject: Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591

---
 third_party/aom/av1/common/alloccommon.h           |    6 +-
 third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 2447 ++++++++-
 third_party/aom/av1/common/arm/av1_inv_txfm_neon.h |    8 +-
 .../aom/av1/common/arm/blend_a64_hmask_neon.c      |    4 +-
 .../aom/av1/common/arm/blend_a64_vmask_neon.c      |    4 +-
 third_party/aom/av1/common/arm/cfl_neon.c          |    4 +-
 third_party/aom/av1/common/arm/convolve_neon.c     |  363 +-
 third_party/aom/av1/common/arm/convolve_neon.h     |    6 +-
 third_party/aom/av1/common/arm/jnt_convolve_neon.c |  512 +-
 third_party/aom/av1/common/arm/mem_neon.h          |   15 +-
 third_party/aom/av1/common/arm/selfguided_neon.c   |   18 +-
 third_party/aom/av1/common/arm/transpose_neon.h    |   83 +-
 third_party/aom/av1/common/arm/warp_plane_neon.c   |  714 +++
 .../aom/av1/common/arm/wiener_convolve_neon.c      |  145 +-
 third_party/aom/av1/common/av1_inv_txfm1d.c        |  140 +-
 third_party/aom/av1/common/av1_inv_txfm1d.h        |    6 +-
 third_party/aom/av1/common/av1_inv_txfm1d_cfg.h    |    6 +-
 third_party/aom/av1/common/av1_loopfilter.c        |  945 +++-
 third_party/aom/av1/common/av1_loopfilter.h        |  120 +-
 third_party/aom/av1/common/av1_rtcd_defs.pl        |   46 +-
 third_party/aom/av1/common/av1_txfm.c              |   50 +
 third_party/aom/av1/common/av1_txfm.h              |   32 +-
 third_party/aom/av1/common/blockd.c                |   64 +-
 third_party/aom/av1/common/blockd.h                |   53 +-
 third_party/aom/av1/common/cdef.h                  |    6 +-
 third_party/aom/av1/common/cdef_block.h            |    6 +-
 third_party/aom/av1/common/cdef_block_simd.h       |    5 +
 third_party/aom/av1/common/cfl.h                   |    6 +-
 third_party/aom/av1/common/common.h                |    6 +-
 third_party/aom/av1/common/common_data.h           |   75 +-
 third_party/aom/av1/common/convolve.c              |  116 +-
 third_party/aom/av1/common/convolve.h              |   21 +-
 third_party/aom/av1/common/entropy.h               |    6 +-
 third_party/aom/av1/common/entropymode.h           |    8 +-
 third_party/aom/av1/common/entropymv.c             |   55 -
 third_party/aom/av1/common/entropymv.h             |   16 +-
 third_party/aom/av1/common/enums.h                 |   12 +-
 third_party/aom/av1/common/filter.h                |   22 +-
 third_party/aom/av1/common/frame_buffers.c         |   11 +
 third_party/aom/av1/common/frame_buffers.h         |   12 +-
 third_party/aom/av1/common/idct.c                  |  274 +-
 third_party/aom/av1/common/idct.h                  |   31 +-
 third_party/aom/av1/common/mv.h                    |    8 +-
 third_party/aom/av1/common/mvref_common.c          |  381 +-
 third_party/aom/av1/common/mvref_common.h          |   43 +-
 third_party/aom/av1/common/obmc.h                  |   14 +-
 third_party/aom/av1/common/obu_util.c              |  147 +
 third_party/aom/av1/common/obu_util.h              |   47 +
 third_party/aom/av1/common/odintrin.h              |   12 +-
 third_party/aom/av1/common/onyxc_int.h             |   29 +-
 third_party/aom/av1/common/ppc/cfl_ppc.c           |   85 +-
 third_party/aom/av1/common/pred_common.c           |    4 +-
 third_party/aom/av1/common/pred_common.h           |    6 +-
 third_party/aom/av1/common/quant_common.h          |    6 +-
 third_party/aom/av1/common/reconinter.c            |  652 +--
 third_party/aom/av1/common/reconinter.h            |  148 +-
 third_party/aom/av1/common/reconintra.h            |    6 +-
 third_party/aom/av1/common/resize.c                |   39 +-
 third_party/aom/av1/common/resize.h                |    6 +-
 third_party/aom/av1/common/restoration.c           |  131 +-
 third_party/aom/av1/common/restoration.h           |    7 +-
 third_party/aom/av1/common/scale.h                 |    7 +-
 third_party/aom/av1/common/scan.h                  |    6 +-
 third_party/aom/av1/common/seg_common.h            |    6 +-
 third_party/aom/av1/common/thread_common.c         |   18 +-
 third_party/aom/av1/common/thread_common.h         |    6 +-
 third_party/aom/av1/common/tile_common.c           |   16 +
 third_party/aom/av1/common/tile_common.h           |    9 +-
 third_party/aom/av1/common/timing.h                |    6 +-
 third_party/aom/av1/common/token_cdfs.h            |    5 +
 third_party/aom/av1/common/txb_common.h            |  243 +-
 third_party/aom/av1/common/warped_motion.c         |    4 +-
 third_party/aom/av1/common/warped_motion.h         |    6 +-
 .../aom/av1/common/x86/av1_convolve_scale_sse4.c   |    1 -
 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c |    6 +
 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h |    6 +-
 .../aom/av1/common/x86/av1_inv_txfm_ssse3.c        |    6 +
 .../aom/av1/common/x86/av1_inv_txfm_ssse3.h        |   10 +-
 third_party/aom/av1/common/x86/av1_txfm_sse2.h     |    6 +-
 third_party/aom/av1/common/x86/av1_txfm_sse4.h     |   11 +-
 third_party/aom/av1/common/x86/cfl_simd.h          |    5 +
 third_party/aom/av1/common/x86/convolve_2d_avx2.c  |    2 -
 third_party/aom/av1/common/x86/convolve_2d_sse2.c  |    3 +-
 third_party/aom/av1/common/x86/convolve_sse2.c     |   11 +-
 .../aom/av1/common/x86/highbd_convolve_2d_avx2.c   |    1 -
 .../aom/av1/common/x86/highbd_convolve_2d_sse4.c   |    1 -
 .../aom/av1/common/x86/highbd_convolve_2d_ssse3.c  |    1 -
 .../aom/av1/common/x86/highbd_inv_txfm_avx2.c      | 1117 +++-
 .../aom/av1/common/x86/highbd_inv_txfm_sse4.c      | 5335 +++++++++++++++-----
 .../aom/av1/common/x86/highbd_jnt_convolve_avx2.c  |    1 -
 .../aom/av1/common/x86/highbd_txfm_utility_sse4.h  |   28 +-
 .../aom/av1/common/x86/highbd_warp_plane_sse4.c    |  268 +-
 third_party/aom/av1/common/x86/jnt_convolve_avx2.c |  211 +-
 third_party/aom/av1/common/x86/reconinter_avx2.c   |  496 ++
 third_party/aom/av1/common/x86/selfguided_avx2.c   |   23 +-
 third_party/aom/av1/common/x86/selfguided_sse4.c   |   24 +-
 third_party/aom/av1/common/x86/warp_plane_sse4.c   |  809 ++-
 .../aom/av1/common/x86/wiener_convolve_avx2.c      |    3 +-
 .../aom/av1/common/x86/wiener_convolve_sse2.c      |    3 +-
 99 files changed, 12657 insertions(+), 4313 deletions(-)
 create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.c
 create mode 100644 third_party/aom/av1/common/obu_util.c
 create mode 100644 third_party/aom/av1/common/obu_util.h

(limited to 'third_party/aom/av1/common')

diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
index dbcb5b947..8e5896981 100644
--- a/third_party/aom/av1/common/alloccommon.h
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ALLOCCOMMON_H_
-#define AV1_COMMON_ALLOCCOMMON_H_
+#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_
+#define AOM_AV1_COMMON_ALLOCCOMMON_H_
 
 #define INVALID_IDX -1  // Invalid buffer index.
 
@@ -45,4 +45,4 @@ int av1_get_MBs(int width, int height);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ALLOCCOMMON_H_
+#endif  // AOM_AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
index 51c991498..bad411743 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <arm_neon.h>
+
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
@@ -19,19 +21,7 @@
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"
 #include "av1/common/arm/av1_inv_txfm_neon.h"
-
-static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
-  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
-  TxSetType tx_set_type;
-  if (tx_size_sqr_up > TX_32X32) {
-    tx_set_type = EXT_TX_SET_DCTONLY;
-  } else if (tx_size_sqr_up == TX_32X32) {
-    tx_set_type = EXT_TX_SET_DCT_IDTX;
-  } else {
-    tx_set_type = EXT_TX_SET_ALL16;
-  }
-  return tx_set_type;
-}
+#include "av1/common/arm/transpose_neon.h"
 
 // 1D itx types
 typedef enum ATTRIBUTE_PACKED {
@@ -65,6 +55,2038 @@ static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
   { av1_idct64_new, NULL, NULL },
 };
 
+// Adds the 8-wide residual rows in |in| to the pixels at |output| and stores
+// the sums narrowed to uint8_t with saturation. |flipud| reads |in| bottom-up.
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+                                                  uint8_t *output, int stride,
+                                                  int flipud,
+                                                  const int height) {
+  const int first = flipud ? (height - 1) : 0;
+  const int dir = flipud ? -1 : 1;
+  for (int row = 0; row < height; ++row) {
+    const int16x8_t pred = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+    vst1_u8(output, vqmovun_s16(vaddq_s16(pred, in[first + dir * row])));
+    output += stride;
+  }
+}
+
+// Reconstructs 16 pixels: res0 is added to the low 8 bytes of |pred| and res1
+// to the high 8; each sum is narrowed to uint8_t with saturation.
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+                                                    int16x8_t res0,
+                                                    int16x8_t res1) {
+  // Zero-extend the prediction to 16 bits before adding the signed residual.
+  const int16x8_t pred_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+  const int16x8_t pred_hi =
+      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+  const uint8x8_t recon_lo = vqmovun_s16(vaddq_s16(pred_lo, res0));
+  const uint8x8_t recon_hi = vqmovun_s16(vaddq_s16(pred_hi, res1));
+  return vcombine_u8(recon_lo, recon_hi);
+}
+
+// 16-wide residual add. For each destination row the first 8 pixels use
+// in[j] and the last 8 use in[j + height]; j runs backwards when |flipud|.
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+                                                   uint8_t *output, int stride,
+                                                   int flipud, int height) {
+  const int first = flipud ? (height - 1) : 0;
+  const int dir = flipud ? -1 : 1;
+  for (int row = 0, src = first; row < height; ++row, src += dir) {
+    uint8_t *const dst = output + row * stride;
+    const uint8x16_t pred = vld1q_u8(dst);
+    vst1q_u8(dst, lowbd_get_recon_16x16_neon(pred, in[src], in[src + height]));
+  }
+}
+
+// Fills a[0..size) with vectors whose eight lanes all equal (int16_t)value.
+static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
+                                                int value) {
+  const int16x8_t fill = vdupq_n_s16((int16_t)value);
+  for (int k = 0; k < size; ++k) a[k] = fill;
+}
+
+// Butterfly using lanes 0 and 1 of |c| (32-bit accumulate, 16-bit result):
+//   *t0 = round_shift(in0 * c[0] + in1 * c[1], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[1] - in1 * c[0], INV_COS_BIT)
+static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t in0_lo = vget_low_s16(in0), in0_hi = vget_high_s16(in0);
+  const int16x4_t in1_lo = vget_low_s16(in1), in1_hi = vget_high_s16(in1);
+
+  int32x4_t sum_lo = vmull_lane_s16(in0_lo, c, 0);
+  int32x4_t sum_hi = vmull_lane_s16(in0_hi, c, 0);
+  int32x4_t dif_lo = vmull_lane_s16(in0_lo, c, 1);
+  int32x4_t dif_hi = vmull_lane_s16(in0_hi, c, 1);
+
+  sum_lo = vmlal_lane_s16(sum_lo, in1_lo, c, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, in1_hi, c, 1);
+  dif_lo = vmlsl_lane_s16(dif_lo, in1_lo, c, 0);
+  dif_hi = vmlsl_lane_s16(dif_hi, in1_hi, c, 0);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
+                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
+}
+
+// Butterfly using lanes 1 and 0 of |c| (32-bit accumulate, 16-bit result):
+//   *t0 = round_shift(in0 * c[1] + in1 * c[0], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[0] - in1 * c[1], INV_COS_BIT)
+static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t in0_lo = vget_low_s16(in0), in0_hi = vget_high_s16(in0);
+  const int16x4_t in1_lo = vget_low_s16(in1), in1_hi = vget_high_s16(in1);
+
+  int32x4_t sum_lo = vmull_lane_s16(in0_lo, c, 1);
+  int32x4_t sum_hi = vmull_lane_s16(in0_hi, c, 1);
+  int32x4_t dif_lo = vmull_lane_s16(in0_lo, c, 0);
+  int32x4_t dif_hi = vmull_lane_s16(in0_hi, c, 0);
+
+  sum_lo = vmlal_lane_s16(sum_lo, in1_lo, c, 0);
+  sum_hi = vmlal_lane_s16(sum_hi, in1_hi, c, 0);
+  dif_lo = vmlsl_lane_s16(dif_lo, in1_lo, c, 1);
+  dif_hi = vmlsl_lane_s16(dif_hi, in1_hi, c, 1);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
+                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
+}
+
+// Butterfly using lanes 2 and 3 of |c| (32-bit accumulate, 16-bit result):
+//   *t0 = round_shift(in0 * c[2] + in1 * c[3], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[3] - in1 * c[2], INV_COS_BIT)
+static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t in0_lo = vget_low_s16(in0), in0_hi = vget_high_s16(in0);
+  const int16x4_t in1_lo = vget_low_s16(in1), in1_hi = vget_high_s16(in1);
+
+  int32x4_t sum_lo = vmull_lane_s16(in0_lo, c, 2);
+  int32x4_t sum_hi = vmull_lane_s16(in0_hi, c, 2);
+  int32x4_t dif_lo = vmull_lane_s16(in0_lo, c, 3);
+  int32x4_t dif_hi = vmull_lane_s16(in0_hi, c, 3);
+
+  sum_lo = vmlal_lane_s16(sum_lo, in1_lo, c, 3);
+  sum_hi = vmlal_lane_s16(sum_hi, in1_hi, c, 3);
+  dif_lo = vmlsl_lane_s16(dif_lo, in1_lo, c, 2);
+  dif_hi = vmlsl_lane_s16(dif_hi, in1_hi, c, 2);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
+                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
+}
+
+// Scales |in0| by two scalar coefficients, each product formed in 32 bits:
+//   *t0 = round_shift(in0 * coef1, INV_COS_BIT)
+//   *t1 = round_shift(in0 * coef2, INV_COS_BIT)
+static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
+                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t lo = vget_low_s16(in0);
+  const int16x4_t hi = vget_high_s16(in0);
+
+  const int32x4_t p1_lo = vmull_n_s16(lo, coef1);
+  const int32x4_t p1_hi = vmull_n_s16(hi, coef1);
+  const int32x4_t p2_lo = vmull_n_s16(lo, coef2);
+  const int32x4_t p2_hi = vmull_n_s16(hi, coef2);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(p1_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p1_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(p2_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p2_hi, INV_COS_BIT));
+}
+
+// Butterfly using lanes 3 and 2 of |c| (32-bit accumulate, 16-bit result):
+//   *t0 = round_shift(in0 * c[3] + in1 * c[2], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[2] - in1 * c[3], INV_COS_BIT)
+static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t in0_lo = vget_low_s16(in0), in0_hi = vget_high_s16(in0);
+  const int16x4_t in1_lo = vget_low_s16(in1), in1_hi = vget_high_s16(in1);
+
+  int32x4_t sum_lo = vmull_lane_s16(in0_lo, c, 3);
+  int32x4_t sum_hi = vmull_lane_s16(in0_hi, c, 3);
+  int32x4_t dif_lo = vmull_lane_s16(in0_lo, c, 2);
+  int32x4_t dif_hi = vmull_lane_s16(in0_hi, c, 2);
+
+  sum_lo = vmlal_lane_s16(sum_lo, in1_lo, c, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, in1_hi, c, 2);
+  dif_lo = vmlsl_lane_s16(dif_lo, in1_lo, c, 3);
+  dif_hi = vmlsl_lane_s16(dif_hi, in1_hi, c, 3);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
+                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
+}
+
+// In-place half butterfly on x[0], x[1] using only lane 0 of |c|:
+//   x[0] <- round_shift(x[0] * c[0] + x[1] * c[0], INV_COS_BIT)
+//   x[1] <- round_shift(x[0] * c[0] - x[1] * c[0], INV_COS_BIT)
+// where both right-hand sides use the values x[] held on entry.
+static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
+  // Don't add/sub before multiply, which will overflow in iadst8: widen each
+  // input to 32 bits via the multiply first, then combine the products.
+  const int32x4_t a_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+  const int32x4_t a_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+  const int32x4_t b_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+  const int32x4_t b_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+
+  const int32x4_t sum_lo = vaddq_s32(a_lo, b_lo);
+  const int32x4_t sum_hi = vaddq_s32(a_hi, b_hi);
+  const int32x4_t dif_lo = vsubq_s32(a_lo, b_lo);
+  const int32x4_t dif_hi = vsubq_s32(a_hi, b_hi);
+
+  // Rounding shift narrows each 32-bit result back to 16 bits.
+  x[0] = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                      vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  x[1] = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
+                      vrshrn_n_s32(dif_hi, INV_COS_BIT));
+}
+
+// Gathers *c0..*c3 into lanes 0..3. NOTE(review): the callers visible here
+// pass int32_t cospi entries cast to int16_t *, which reads only the low 16
+// bits on little-endian targets -- confirm this is intended.
+static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
+                                          int16_t *const c2,
+                                          int16_t *const c3) {
+  const int16x4_t lane0 = vld1_lane_s16(c0, vdup_n_s16((int16_t)0), 0);
+  const int16x4_t lane01 = vld1_lane_s16(c1, lane0, 1);
+  return vld1_lane_s16(c3, vld1_lane_s16(c2, lane01, 2), 3);
+}
+
+// 8-point inverse ADST over eight int16x8_t rows. All of in[] is copied into
+// x[] before out[] is written, so out may alias in. |bit| is unused.
+static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+                        (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
+                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  int16x8_t x[8];
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // Stage 1: input permutation.
+  x[0] = in[7];
+  x[1] = in[0];
+  x[2] = in[5];
+  x[3] = in[2];
+  x[4] = in[3];
+  x[5] = in[4];
+  x[6] = in[1];
+  x[7] = in[6];
+
+  // Stage 2: butterflies with cospi[4,60], [20,44], [36,28] and [52,12].
+  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+
+  // Stage 3: saturating add/sub of s[i] and s[i + 4].
+  x[0] = vqaddq_s16(s0, s4);
+  x[1] = vqaddq_s16(s1, s5);
+  x[2] = vqaddq_s16(s2, s6);
+  x[3] = vqaddq_s16(s3, s7);
+  x[4] = vqsubq_s16(s0, s4);
+  x[5] = vqsubq_s16(s1, s5);
+  x[6] = vqsubq_s16(s2, s6);
+  x[7] = vqsubq_s16(s3, s7);
+
+  // Stage 4: x[0..3] pass through; butterflies on x[4..7] with cospi[16,48].
+  s0 = x[0];
+  s1 = x[1];
+  s2 = x[2];
+  s3 = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
+
+  // Stage 5: saturating add/sub of s[i] and s[i + 2] within each half.
+  x[0] = vqaddq_s16(s0, s2);
+  x[1] = vqaddq_s16(s1, s3);
+  x[2] = vqsubq_s16(s0, s2);
+  x[3] = vqsubq_s16(s1, s3);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+
+  // Stage 6: cospi[32] half butterflies on (x[2], x[3]) and (x[6], x[7]).
+  btf_16_half_neon(x + 2, c2);
+  btf_16_half_neon(x + 6, c2);
+
+  // Stage 7: output permutation; odd-numbered outputs are negated.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[4]);
+  out[2] = x[6];
+  out[3] = vnegq_s16(x[2]);
+  out[4] = x[3];
+  out[5] = vnegq_s16(x[7]);
+  out[6] = x[5];
+  out[7] = vnegq_s16(x[1]);
+}
+
+// iadst8 variant that reads only in[0]. NOTE(review): presumably intended for
+// blocks where in[1..7] are all zero -- confirm at the call site.
+static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  int16x8_t x[8];
+  int16x8_t s0, s1, s4, s5;
+
+  // Stage 1
+  x[1] = in[0];
+
+  // Stage 2: s0 = x[1] * cospi[60], s1 = x[1] * -cospi[4] (rounded).
+  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);
+
+  // Stage 3: both halves start from the same s0, s1.
+  x[0] = s0;
+  x[1] = s1;
+  x[4] = s0;
+  x[5] = s1;
+
+  // Stage 4: x[0], x[1] pass through; butterfly on x[4], x[5].
+  s0 = x[0];
+  s1 = x[1];
+  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+
+  // Stage 5: each pair is duplicated into the slot two positions up.
+  x[0] = s0;
+  x[1] = s1;
+  x[2] = s0;
+  x[3] = s1;
+  x[4] = s4;
+  x[5] = s5;
+  x[6] = s4;
+  x[7] = s5;
+
+  // Stage 6: cospi[32] half butterflies on (x[2], x[3]) and (x[6], x[7]).
+  btf_16_half_neon(x + 2, c2);
+  btf_16_half_neon(x + 6, c2);
+
+  // Stage 7: output permutation; odd-numbered outputs are negated.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[4]);
+  out[2] = x[6];
+  out[3] = vnegq_s16(x[2]);
+  out[4] = x[3];
+  out[5] = vnegq_s16(x[7]);
+  out[6] = x[5];
+  out[7] = vnegq_s16(x[1]);
+}
+
+// 8-point inverse DCT on int16x8_t rows; out[] is written only in stage 5.
+static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+                                  int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[8], step2[8];
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+  // stage 2: butterflies on the odd-indexed inputs.
+  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
+  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
+
+  // stage 3: butterflies on the even-indexed inputs; add/sub on step1[4..7].
+  btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
+  btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+
+  // stage 4: add/sub on step2[0..3]; cospi[32] butterfly on step2[6], step2[5].
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+
+  // stage 5: final saturating add/sub; out[i] and out[7 - i] share operands.
+  out[0] = vqaddq_s16(step1[0], step2[7]);
+  out[1] = vqaddq_s16(step1[1], step1[6]);
+  out[2] = vqaddq_s16(step1[2], step1[5]);
+  out[3] = vqaddq_s16(step1[3], step2[4]);
+  out[4] = vqsubq_s16(step1[3], step2[4]);
+  out[5] = vqsubq_s16(step1[2], step1[5]);
+  out[6] = vqsubq_s16(step1[1], step1[6]);
+  out[7] = vqsubq_s16(step1[0], step2[7]);
+}
+
+// 8-point inverse DCT that reads only in[0]: all eight outputs receive the
+// same vector, round_shift(in[0] * cospi[32], INV_COS_BIT).
+// |bit| is unused.
+static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                       int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16_t dc_scale = (int16_t)cospi[32];
+
+  // stage 1
+  // stage 2
+  // stage 3
+  // Widen to 32 bits for the multiply, low and high halves separately.
+  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), dc_scale);
+  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(in[0]), dc_scale);
+
+  // Rounding shift back down to 16 bits.
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));
+
+  // stage 4
+  // stage 5
+  // Broadcast the single computed row to every output.
+  for (int i = 0; i < 8; ++i) {
+    out[i] = dc;
+  }
+}
+
+// Applies vrshlq_s16 with shift count -bit to arr[0..size), i.e. a rounding
+// right shift for bit > 0. No-op when bit == 0; |size| is asserted to be 4n.
+void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
+  assert(!(size % 4));
+  if (bit == 0) return;
+  const int16x8_t shift = vdupq_n_s16((int16_t)(-bit));
+  for (int k = 0; k < size; ++k) arr[k] = vrshlq_s16(arr[k], shift);
+}
+
+// Reverses input[0..size) in place. Swapping mirrored elements removes the
+// fixed temp[8] scratch buffer, which overflowed for any size > 8.
+static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
+  for (int lo = 0, hi = size - 1; lo < hi; ++lo, --hi) {
+    const int16x8_t tmp = input[lo];
+    input[lo] = input[hi];
+    input[hi] = tmp;
+  }
+}
+
+// Loads 8 rows of 8 int32 values, |out_size| apart, truncating to int16.
+static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
+                                                   int16x8_t *const a,
+                                                   int out_size) {
+  for (int row = 0; row < 8; ++row, input += out_size) {
+    a[row] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
+                          vmovn_s32(vld1q_s32(input + 4)));
+  }
+}
+
+// 8-point identity transform: output[i] = input[i] * 2 for i in [0, 8),
+// computed with a non-saturating 16-bit multiply. Each output depends only
+// on the matching input, so output may alias input.
+static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit, int bit) {
+  // Neither parameter is used here; presumably they exist so the signature
+  // matches the other 1-D transform kernels.
+  (void)bit;
+  (void)cos_bit;
+
+  for (int i = 0; i < 8; ++i) {
+    output[i] = vmulq_n_s16(input[i], (int16_t)2);
+  }
+}
+
+// For z in [0, size):
+//   output[z] = sat16(round_shift(input[z] * NewInvSqrt2, NewSqrt2Bits))
+// The product is formed in 32 bits and the narrowing shift saturates.
+static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
+                                        int size) {
+  const int16_t scale = (int16_t)NewInvSqrt2;
+
+  for (int z = 0; z < size; ++z) {
+    const int32x4_t lo = vmull_n_s16(vget_low_s16(input[z]), scale);
+    const int32x4_t hi = vmull_n_s16(vget_high_s16(input[z]), scale);
+
+    output[z] = vcombine_s16(vqrshrn_n_s32(lo, (int32_t)NewSqrt2Bits),
+                             vqrshrn_n_s32(hi, (int32_t)NewSqrt2Bits));
+  }
+}
+
+// 16-point identity transform. For z in [0, 16):
+//   output[z] = sat16(round_shift(input[z] * (2 * NewSqrt2), NewSqrt2Bits))
+// with the product formed in 32 bits. |cos_bit| and |bit| are unused.
+static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
+                                       int8_t cos_bit, int bit) {
+  (void)cos_bit;
+  (void)bit;
+
+  // The scale is converted to int16_t before the widening multiply.
+  const int16_t scale = (int16_t)(2 * NewSqrt2);
+
+  for (int z = 0; z < 16; ++z) {
+    const int32x4_t lo = vmull_n_s16(vget_low_s16(input[z]), scale);
+    const int32x4_t hi = vmull_n_s16(vget_high_s16(input[z]), scale);
+
+    output[z] = vcombine_s16(vqrshrn_n_s32(lo, (int32_t)NewSqrt2Bits),
+                             vqrshrn_n_s32(hi, (int32_t)NewSqrt2Bits));
+  }
+}
+
+// 32-point identity: output[z] = input[z] * 4, non-saturating 16-bit multiply.
+static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
+                                       int8_t cos_bit, int bit) {
+  (void)cos_bit;
+  (void)bit;
+  for (int row = 0; row < 32; ++row) {
+    output[row] = vmulq_n_s16(input[row], (int16_t)4);
+  }
+}
+
+// 16-point inverse DCT reading only in[0]; all 16 outputs are the same row.
+static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1;
+  int32x4_t t32[2];
+
+  // stage 4: in[0] * cospi[32] in 32 bits, rounded back to 16 bits.
+  t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                       vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  // stage 6
+  // stage 7: broadcast the single computed row to every output.
+  out[0] = step1;
+  out[1] = step1;
+  out[2] = step1;
+  out[3] = step1;
+  out[4] = step1;
+  out[5] = step1;
+  out[6] = step1;
+  out[7] = step1;
+  out[8] = step1;
+  out[9] = step1;
+  out[10] = step1;
+  out[11] = step1;
+  out[12] = step1;
+  out[13] = step1;
+  out[14] = step1;
+  out[15] = step1;
+}
+
+// 16-point inverse DCT over sixteen int16x8_t rows. Every in[] entry is read
+// during stage 2, before any out[] entry is written, so out may alias in.
+// |bit| is unused.
+static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[16], step2[16];
+
+  // Each vector packs four cospi[] entries for the lane-indexed butterflies.
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c3 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // stage 2: butterflies on the odd-indexed inputs.
+  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
+  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
+  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
+  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
+
+  // Even-indexed inputs are taken in bit-reversed order.
+  step2[0] = in[0];
+  step2[1] = in[8];
+  step2[2] = in[4];
+  step2[3] = in[12];
+  step2[4] = in[2];
+  step2[5] = in[10];
+  step2[6] = in[6];
+  step2[7] = in[14];
+
+  // stage 3: butterflies on step2[4..7], saturating add/sub on step2[8..15].
+  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
+  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
+
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  step1[8] = vqaddq_s16(step2[8], step2[9]);
+  step1[9] = vqsubq_s16(step2[8], step2[9]);
+  step1[10] = vqsubq_s16(step2[11], step2[10]);
+  step1[11] = vqaddq_s16(step2[11], step2[10]);
+  step1[12] = vqaddq_s16(step2[12], step2[13]);
+  step1[13] = vqsubq_s16(step2[12], step2[13]);
+  step1[14] = vqsubq_s16(step2[15], step2[14]);
+  step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+  // stage 4
+  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
+  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
+  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
+                       &step2[10], &step2[13]);
+
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  step1[7] = step2[7];
+  step1[8] = vqaddq_s16(step2[8], step2[11]);
+  step1[9] = vqaddq_s16(step2[9], step2[10]);
+  step1[10] = vqsubq_s16(step2[9], step2[10]);
+  step1[11] = vqsubq_s16(step2[8], step2[11]);
+  step1[12] = vqsubq_s16(step2[15], step2[12]);
+  step1[13] = vqsubq_s16(step2[14], step2[13]);
+  step1[14] = vqaddq_s16(step2[14], step2[13]);
+  step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+  // stage 6
+  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[7]);
+  step2[1] = vqaddq_s16(step1[1], step1[6]);
+  step2[2] = vqaddq_s16(step1[2], step1[5]);
+  step2[3] = vqaddq_s16(step1[3], step1[4]);
+  step2[4] = vqsubq_s16(step1[3], step1[4]);
+  step2[5] = vqsubq_s16(step1[2], step1[5]);
+  step2[6] = vqsubq_s16(step1[1], step1[6]);
+  step2[7] = vqsubq_s16(step1[0], step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  out[0] = vqaddq_s16(step2[0], step2[15]);
+  out[1] = vqaddq_s16(step2[1], step2[14]);
+  out[2] = vqaddq_s16(step2[2], step2[13]);
+  out[3] = vqaddq_s16(step2[3], step2[12]);
+  out[4] = vqaddq_s16(step2[4], step2[11]);
+  out[5] = vqaddq_s16(step2[5], step2[10]);
+  out[6] = vqaddq_s16(step2[6], step2[9]);
+  out[7] = vqaddq_s16(step2[7], step2[8]);
+  out[8] = vqsubq_s16(step2[7], step2[8]);
+  out[9] = vqsubq_s16(step2[6], step2[9]);
+  out[10] = vqsubq_s16(step2[5], step2[10]);
+  out[11] = vqsubq_s16(step2[4], step2[11]);
+  out[12] = vqsubq_s16(step2[3], step2[12]);
+  out[13] = vqsubq_s16(step2[2], step2[13]);
+  out[14] = vqsubq_s16(step2[1], step2[14]);
+  out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;  // accepted but not used by this variant
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[16], step2[16];
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // Reads only in[0]..in[7] and writes all of out[0]..out[15].
+  // stages 1 and 2: copy even inputs, run odd inputs through btf_16_neon
+
+  step2[0] = in[0];
+  step2[2] = in[4];
+  step2[4] = in[2];
+  step2[6] = in[6];
+
+  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
+  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
+  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
+  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
+
+  // stage 3: btf on step2[4] and step2[6]; saturating add/sub on step2[8..15]
+
+  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[8] = vqaddq_s16(step2[8], step2[9]);
+  step1[9] = vqsubq_s16(step2[8], step2[9]);
+  step1[10] = vqsubq_s16(step2[11], step2[10]);
+  step1[11] = vqaddq_s16(step2[11], step2[10]);
+  step1[12] = vqaddq_s16(step2[12], step2[13]);
+  step1[13] = vqsubq_s16(step2[12], step2[13]);
+  step1[14] = vqsubq_s16(step2[15], step2[14]);
+  step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+  // stage 4: btf on 0, 2, 9/14 and 10/13; saturating add/sub on step1[4..7]
+
+  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
+                       &step2[10], &step2[13]);
+
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5: btf on 5/6; saturating add/sub on step2[0..3] and step2[8..15]
+
+  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  step1[7] = step2[7];
+  step1[8] = vqaddq_s16(step2[8], step2[11]);
+  step1[9] = vqaddq_s16(step2[9], step2[10]);
+  step1[10] = vqsubq_s16(step2[9], step2[10]);
+  step1[11] = vqsubq_s16(step2[8], step2[11]);
+  step1[12] = vqsubq_s16(step2[15], step2[12]);
+  step1[13] = vqsubq_s16(step2[14], step2[13]);
+  step1[14] = vqaddq_s16(step2[14], step2[13]);
+  step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+  // stage 6: btf on 10/13 and 11/12; saturating add/sub on step1[0..7]
+  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
+  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[7]);
+  step2[1] = vqaddq_s16(step1[1], step1[6]);
+  step2[2] = vqaddq_s16(step1[2], step1[5]);
+  step2[3] = vqaddq_s16(step1[3], step1[4]);
+  step2[4] = vqsubq_s16(step1[3], step1[4]);
+  step2[5] = vqsubq_s16(step1[2], step1[5]);
+  step2[6] = vqsubq_s16(step1[1], step1[6]);
+  step2[7] = vqsubq_s16(step1[0], step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: out[i], out[15 - i] = saturating step2[i] +/- step2[15 - i]
+
+  out[0] = vqaddq_s16(step2[0], step2[15]);
+  out[1] = vqaddq_s16(step2[1], step2[14]);
+  out[2] = vqaddq_s16(step2[2], step2[13]);
+  out[3] = vqaddq_s16(step2[3], step2[12]);
+  out[4] = vqaddq_s16(step2[4], step2[11]);
+  out[5] = vqaddq_s16(step2[5], step2[10]);
+  out[6] = vqaddq_s16(step2[6], step2[9]);
+  out[7] = vqaddq_s16(step2[7], step2[8]);
+  out[8] = vqsubq_s16(step2[7], step2[8]);
+  out[9] = vqsubq_s16(step2[6], step2[9]);
+  out[10] = vqsubq_s16(step2[5], step2[10]);
+  out[11] = vqsubq_s16(step2[4], step2[11]);
+  out[12] = vqsubq_s16(step2[3], step2[12]);
+  out[13] = vqsubq_s16(step2[2], step2[13]);
+  out[14] = vqsubq_s16(step2[1], step2[14]);
+  out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
+  (void)bit;  // accepted but not used by this variant
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // Reads in[0..15] and writes out[0..15], both as int16x8_t vectors.
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+                        (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+                        (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
+                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+  const int16x4_t c3 =
+      create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
+                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+
+  const int16x4_t c =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  int16x8_t x[16];
+  int16x8_t t[14];  // only t[0..11] are used below
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+  // Stage 1: permute the inputs into x[]
+  x[0] = in[15];
+  x[1] = in[0];
+  x[2] = in[13];
+  x[3] = in[2];
+  x[4] = in[11];
+  x[5] = in[4];
+  x[6] = in[9];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[9] = in[8];
+  x[10] = in[5];
+  x[11] = in[10];
+  x[12] = in[3];
+  x[13] = in[12];
+  x[14] = in[1];
+  x[15] = in[14];
+
+  // Stage 2: btf_16_lane_* on each pair (x[2k], x[2k + 1]) -> s0..s15
+  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
+  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
+  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
+
+  // Stage 3: saturating add/sub of sN with s(N + 8), N = 0..7
+  x[0] = vqaddq_s16(s0, s8);
+  x[1] = vqaddq_s16(s1, s9);
+  x[2] = vqaddq_s16(s2, s10);
+  x[3] = vqaddq_s16(s3, s11);
+  x[4] = vqaddq_s16(s4, s12);
+  x[5] = vqaddq_s16(s5, s13);
+  x[6] = vqaddq_s16(s6, s14);
+  x[7] = vqaddq_s16(s7, s15);
+  x[8] = vqsubq_s16(s0, s8);
+  x[9] = vqsubq_s16(s1, s9);
+  x[10] = vqsubq_s16(s2, s10);
+  x[11] = vqsubq_s16(s3, s11);
+  x[12] = vqsubq_s16(s4, s12);
+  x[13] = vqsubq_s16(s5, s13);
+  x[14] = vqsubq_s16(s6, s14);
+  x[15] = vqsubq_s16(s7, s15);
+
+  // Stage 4: x[0..7] are carried over in t[]; btf_16_lane_* on x[8..15]
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  t[4] = x[4];
+  t[5] = x[5];
+  t[6] = x[6];
+  t[7] = x[7];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+  // Stage 5: saturating add/sub at distance 4 within each half
+  x[0] = vqaddq_s16(t[0], t[4]);
+  x[1] = vqaddq_s16(t[1], t[5]);
+  x[2] = vqaddq_s16(t[2], t[6]);
+  x[3] = vqaddq_s16(t[3], t[7]);
+  x[4] = vqsubq_s16(t[0], t[4]);
+  x[5] = vqsubq_s16(t[1], t[5]);
+  x[6] = vqsubq_s16(t[2], t[6]);
+  x[7] = vqsubq_s16(t[3], t[7]);
+  x[8] = vqaddq_s16(s8, s12);
+  x[9] = vqaddq_s16(s9, s13);
+  x[10] = vqaddq_s16(s10, s14);
+  x[11] = vqaddq_s16(s11, s15);
+  x[12] = vqsubq_s16(s8, s12);
+  x[13] = vqsubq_s16(s9, s13);
+  x[14] = vqsubq_s16(s10, s14);
+  x[15] = vqsubq_s16(s11, s15);
+
+  // Stage 6: x[0..3] and x[8..11] are carried in t[]; btf on the other eight
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+  t[8] = x[8];
+  t[9] = x[9];
+  t[10] = x[10];
+  t[11] = x[11];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+  // Stage 7: saturating add/sub at distance 2 within each group of four
+  x[0] = vqaddq_s16(t[0], t[2]);
+  x[1] = vqaddq_s16(t[1], t[3]);
+  x[2] = vqsubq_s16(t[0], t[2]);
+  x[3] = vqsubq_s16(t[1], t[3]);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+  x[8] = vqaddq_s16(t[8], t[10]);
+  x[9] = vqaddq_s16(t[9], t[11]);
+  x[10] = vqsubq_s16(t[8], t[10]);
+  x[11] = vqsubq_s16(t[9], t[11]);
+  x[12] = vqaddq_s16(s12, s14);
+  x[13] = vqaddq_s16(s13, s15);
+  x[14] = vqsubq_s16(s12, s14);
+  x[15] = vqsubq_s16(s13, s15);
+
+  // Stage 8: btf_16_half_neon at x + 2, 6, 10, 14 (presumably in place)
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: permute into out[]; every odd-indexed output is negated
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+                                         int8_t cos_bit, int bit) {
+  (void)bit;  // accepted but not used by this variant
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  int16x8_t x[16];
+  int16x8_t t[10];  // only t[0], t[1], t[8] and t[9] are used below
+  int16x8_t s0, s1, s4, s5;
+  int16x8_t s8, s9, s12, s13;
+
+  // Stage 1: in[0] is the only input read; out[0..15] are all written
+  x[1] = in[0];
+
+  // Stage 2: one-input btf_16_neon with cospi[62] and -cospi[2]
+  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+
+  // Stage 3: plain copies where iadst16_new_neon has saturating add/sub pairs
+  x[0] = s0;
+  x[1] = s1;
+  x[8] = s0;
+  x[9] = s1;
+
+  // Stage 4: x[0..1] are carried in t[]; btf_16_lane_0_1 on x[8], x[9]
+  t[0] = x[0];
+  t[1] = x[1];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+
+  // Stage 5: copies again; x[4..5] and x[12..13] duplicate x[0..1], x[8..9]
+  x[0] = t[0];
+  x[1] = t[1];
+  x[4] = t[0];
+  x[5] = t[1];
+  x[8] = s8;
+  x[9] = s9;
+  x[12] = s8;
+  x[13] = s9;
+
+  // Stage 6: btf_16_lane_2_3 on x[4..5] and x[12..13]; the rest is carried
+  t[0] = x[0];
+  t[1] = x[1];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  t[8] = x[8];
+  t[9] = x[9];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+
+  // Stage 7: copies; each pair is duplicated into the two slots above it
+  x[0] = t[0];
+  x[1] = t[1];
+  x[2] = t[0];
+  x[3] = t[1];
+  x[4] = s4;
+  x[5] = s5;
+  x[6] = s4;
+  x[7] = s5;
+  x[8] = t[8];
+  x[9] = t[9];
+  x[10] = t[8];
+  x[11] = t[9];
+  x[12] = s12;
+  x[13] = s13;
+  x[14] = s12;
+  x[15] = s13;
+
+  // Stage 8: btf_16_half_neon at x + 2, 6, 10, 14 (presumably in place)
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: permute into out[]; every odd-indexed output is negated
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
+                                         int8_t cos_bit, int bit) {
+  (void)bit;  // accepted but not used by this variant
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  int16x8_t x[16];
+  int16x8_t t[14];  // only t[0..11] are used below
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+  // Stage 1: only in[0]..in[7] are read, into eight of the x[] slots
+  x[1] = in[0];
+  x[3] = in[2];
+  x[5] = in[4];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[10] = in[5];
+  x[12] = in[3];
+  x[14] = in[1];
+
+  // Stage 2: one-input btf_16_neon per loaded slot -> s0..s15
+  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
+  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
+  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
+
+  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
+  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
+  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
+  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
+
+  // Stage 3: saturating add/sub of sN with s(N + 8), N = 0..7; fills all x[]
+  x[0] = vqaddq_s16(s0, s8);
+  x[1] = vqaddq_s16(s1, s9);
+  x[2] = vqaddq_s16(s2, s10);
+  x[3] = vqaddq_s16(s3, s11);
+  x[4] = vqaddq_s16(s4, s12);
+  x[5] = vqaddq_s16(s5, s13);
+  x[6] = vqaddq_s16(s6, s14);
+  x[7] = vqaddq_s16(s7, s15);
+  x[8] = vqsubq_s16(s0, s8);
+  x[9] = vqsubq_s16(s1, s9);
+  x[10] = vqsubq_s16(s2, s10);
+  x[11] = vqsubq_s16(s3, s11);
+  x[12] = vqsubq_s16(s4, s12);
+  x[13] = vqsubq_s16(s5, s13);
+  x[14] = vqsubq_s16(s6, s14);
+  x[15] = vqsubq_s16(s7, s15);
+
+  // Stage 4: x[0..7] are carried over in t[]; btf_16_lane_* on x[8..15]
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  t[4] = x[4];
+  t[5] = x[5];
+  t[6] = x[6];
+  t[7] = x[7];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+  // Stage 5: saturating add/sub at distance 4 within each half
+  x[0] = vqaddq_s16(t[0], t[4]);
+  x[1] = vqaddq_s16(t[1], t[5]);
+  x[2] = vqaddq_s16(t[2], t[6]);
+  x[3] = vqaddq_s16(t[3], t[7]);
+  x[4] = vqsubq_s16(t[0], t[4]);
+  x[5] = vqsubq_s16(t[1], t[5]);
+  x[6] = vqsubq_s16(t[2], t[6]);
+  x[7] = vqsubq_s16(t[3], t[7]);
+  x[8] = vqaddq_s16(s8, s12);
+  x[9] = vqaddq_s16(s9, s13);
+  x[10] = vqaddq_s16(s10, s14);
+  x[11] = vqaddq_s16(s11, s15);
+  x[12] = vqsubq_s16(s8, s12);
+  x[13] = vqsubq_s16(s9, s13);
+  x[14] = vqsubq_s16(s10, s14);
+  x[15] = vqsubq_s16(s11, s15);
+
+  // Stage 6: x[0..3] and x[8..11] are carried in t[]; btf on the other eight
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+  t[8] = x[8];
+  t[9] = x[9];
+  t[10] = x[10];
+  t[11] = x[11];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+  // Stage 7: saturating add/sub at distance 2 within each group of four
+  x[0] = vqaddq_s16(t[0], t[2]);
+  x[1] = vqaddq_s16(t[1], t[3]);
+  x[2] = vqsubq_s16(t[0], t[2]);
+  x[3] = vqsubq_s16(t[1], t[3]);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+  x[8] = vqaddq_s16(t[8], t[10]);
+  x[9] = vqaddq_s16(t[9], t[11]);
+  x[10] = vqsubq_s16(t[8], t[10]);
+  x[11] = vqsubq_s16(t[9], t[11]);
+  x[12] = vqaddq_s16(s12, s14);
+  x[13] = vqaddq_s16(s13, s15);
+  x[14] = vqsubq_s16(s12, s14);
+  x[15] = vqsubq_s16(s13, s15);
+
+  // Stage 8: btf_16_half_neon at x + 2, 6, 10, 14 (presumably in place)
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: permute into out[]; every odd-indexed output is negated
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;  // accepted but not used by this variant
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[32], step2[32];
+  // Reads in[0..31] and writes out[0..31]; step1/step2 alternate per stage.
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+                        (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+                        (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
+  const int16x4_t c2 =
+      create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
+                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+  const int16x4_t c3 =
+      create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
+                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+  const int16x4_t c4 =
+      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+  const int16x4_t c5 =
+      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+  const int16x4_t c6 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c7 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // stage 2: btf on the odd inputs -> step2[16..31]; even inputs are copied
+
+  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
+  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
+  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
+  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
+  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
+  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
+
+  step2[0] = in[0];
+  step2[1] = in[16];
+  step2[2] = in[8];
+  step2[3] = in[24];
+  step2[4] = in[4];
+  step2[5] = in[20];
+  step2[6] = in[12];
+  step2[7] = in[28];
+  step2[8] = in[2];
+  step2[9] = in[18];
+  step2[10] = in[10];
+  step2[11] = in[26];
+  step2[12] = in[6];
+  step2[13] = in[22];
+  step2[14] = in[14];
+  step2[15] = in[30];
+
+  // stage 3: btf on step2[8..15]; saturating add/sub pairs in step2[16..31]
+
+  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
+  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
+  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
+  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
+
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  step1[4] = step2[4];
+  step1[5] = step2[5];
+  step1[6] = step2[6];
+  step1[7] = step2[7];
+
+  step1[16] = vqaddq_s16(step2[16], step2[17]);
+  step1[17] = vqsubq_s16(step2[16], step2[17]);
+  step1[18] = vqsubq_s16(step2[19], step2[18]);
+  step1[19] = vqaddq_s16(step2[19], step2[18]);
+  step1[20] = vqaddq_s16(step2[20], step2[21]);
+  step1[21] = vqsubq_s16(step2[20], step2[21]);
+  step1[22] = vqsubq_s16(step2[23], step2[22]);
+  step1[23] = vqaddq_s16(step2[23], step2[22]);
+  step1[24] = vqaddq_s16(step2[24], step2[25]);
+  step1[25] = vqsubq_s16(step2[24], step2[25]);
+  step1[26] = vqsubq_s16(step2[27], step2[26]);
+  step1[27] = vqaddq_s16(step2[27], step2[26]);
+  step1[28] = vqaddq_s16(step2[28], step2[29]);
+  step1[29] = vqsubq_s16(step2[28], step2[29]);
+  step1[30] = vqsubq_s16(step2[31], step2[30]);
+  step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+  // stage 4: btf on 4..7, 17/30, 18/29, 21/26, 22/25; add/sub on 8..15
+
+  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
+  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
+  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
+  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
+                       &step2[18], &step2[29]);
+  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
+                       &step2[22], &step2[25]);
+
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[8] = vqaddq_s16(step1[8], step1[9]);
+  step2[9] = vqsubq_s16(step1[8], step1[9]);
+  step2[10] = vqsubq_s16(step1[11], step1[10]);
+  step2[11] = vqaddq_s16(step1[11], step1[10]);
+  step2[12] = vqaddq_s16(step1[12], step1[13]);
+  step2[13] = vqsubq_s16(step1[12], step1[13]);
+  step2[14] = vqsubq_s16(step1[15], step1[14]);
+  step2[15] = vqaddq_s16(step1[15], step1[14]);
+  step2[16] = step1[16];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[31] = step1[31];
+
+  // stage 5: btf on 0..3, 9/14 and 10/13; add/sub on 4..7 and 16..31
+
+  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
+  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
+  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
+                       &step1[10], &step1[13]);
+
+  step1[4] = vqaddq_s16(step2[4], step2[5]);
+  step1[5] = vqsubq_s16(step2[4], step2[5]);
+  step1[6] = vqsubq_s16(step2[7], step2[6]);
+  step1[7] = vqaddq_s16(step2[7], step2[6]);
+  step1[8] = step2[8];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[19]);
+  step1[17] = vqaddq_s16(step2[17], step2[18]);
+  step1[18] = vqsubq_s16(step2[17], step2[18]);
+  step1[19] = vqsubq_s16(step2[16], step2[19]);
+  step1[20] = vqsubq_s16(step2[23], step2[20]);
+  step1[21] = vqsubq_s16(step2[22], step2[21]);
+  step1[22] = vqaddq_s16(step2[22], step2[21]);
+  step1[23] = vqaddq_s16(step2[23], step2[20]);
+  step1[24] = vqaddq_s16(step2[24], step2[27]);
+  step1[25] = vqaddq_s16(step2[25], step2[26]);
+  step1[26] = vqsubq_s16(step2[25], step2[26]);
+  step1[27] = vqsubq_s16(step2[24], step2[27]);
+  step1[28] = vqsubq_s16(step2[31], step2[28]);
+  step1[29] = vqsubq_s16(step2[30], step2[29]);
+  step1[30] = vqaddq_s16(step2[30], step2[29]);
+  step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+  // stage 6: btf on 5/6, 18/29, 19/28, 20/27, 21/26; add/sub on 0..3, 8..15
+
+  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
+  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
+                       &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
+                       &step2[21], &step2[26]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[3]);
+  step2[1] = vqaddq_s16(step1[1], step1[2]);
+  step2[2] = vqsubq_s16(step1[1], step1[2]);
+  step2[3] = vqsubq_s16(step1[0], step1[3]);
+  step2[4] = step1[4];
+  step2[7] = step1[7];
+  step2[8] = vqaddq_s16(step1[8], step1[11]);
+  step2[9] = vqaddq_s16(step1[9], step1[10]);
+  step2[10] = vqsubq_s16(step1[9], step1[10]);
+  step2[11] = vqsubq_s16(step1[8], step1[11]);
+  step2[12] = vqsubq_s16(step1[15], step1[12]);
+  step2[13] = vqsubq_s16(step1[14], step1[13]);
+  step2[14] = vqaddq_s16(step1[14], step1[13]);
+  step2[15] = vqaddq_s16(step1[15], step1[12]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[22] = step1[22];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[25] = step1[25];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 7: btf on 10/13 and 11/12; add/sub on 0..7 and 16..31
+
+  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
+  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[7]);
+  step1[1] = vqaddq_s16(step2[1], step2[6]);
+  step1[2] = vqaddq_s16(step2[2], step2[5]);
+  step1[3] = vqaddq_s16(step2[3], step2[4]);
+  step1[4] = vqsubq_s16(step2[3], step2[4]);
+  step1[5] = vqsubq_s16(step2[2], step2[5]);
+  step1[6] = vqsubq_s16(step2[1], step2[6]);
+  step1[7] = vqsubq_s16(step2[0], step2[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[9];
+  step1[14] = step2[14];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[23]);
+  step1[17] = vqaddq_s16(step2[17], step2[22]);
+  step1[18] = vqaddq_s16(step2[18], step2[21]);
+  step1[19] = vqaddq_s16(step2[19], step2[20]);
+  step1[20] = vqsubq_s16(step2[19], step2[20]);
+  step1[21] = vqsubq_s16(step2[18], step2[21]);
+  step1[22] = vqsubq_s16(step2[17], step2[22]);
+  step1[23] = vqsubq_s16(step2[16], step2[23]);
+  step1[24] = vqsubq_s16(step2[31], step2[24]);
+  step1[25] = vqsubq_s16(step2[30], step2[25]);
+  step1[26] = vqsubq_s16(step2[29], step2[26]);
+  step1[27] = vqsubq_s16(step2[28], step2[27]);
+  step1[28] = vqaddq_s16(step2[27], step2[28]);
+  step1[29] = vqaddq_s16(step2[26], step2[29]);
+  step1[30] = vqaddq_s16(step2[25], step2[30]);
+  step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+  // stage 8: btf on 20/27, 21/26, 22/25, 23/24; add/sub on step1[0..15]
+
+  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
+  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
+  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[15]);
+  step2[1] = vqaddq_s16(step1[1], step1[14]);
+  step2[2] = vqaddq_s16(step1[2], step1[13]);
+  step2[3] = vqaddq_s16(step1[3], step1[12]);
+  step2[4] = vqaddq_s16(step1[4], step1[11]);
+  step2[5] = vqaddq_s16(step1[5], step1[10]);
+  step2[6] = vqaddq_s16(step1[6], step1[9]);
+  step2[7] = vqaddq_s16(step1[7], step1[8]);
+  step2[8] = vqsubq_s16(step1[7], step1[8]);
+  step2[9] = vqsubq_s16(step1[6], step1[9]);
+  step2[10] = vqsubq_s16(step1[5], step1[10]);
+  step2[11] = vqsubq_s16(step1[4], step1[11]);
+  step2[12] = vqsubq_s16(step1[3], step1[12]);
+  step2[13] = vqsubq_s16(step1[2], step1[13]);
+  step2[14] = vqsubq_s16(step1[1], step1[14]);
+  step2[15] = vqsubq_s16(step1[0], step1[15]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[18] = step1[18];
+  step2[19] = step1[19];
+  step2[28] = step1[28];
+  step2[29] = step1[29];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 9: out[i], out[31 - i] = saturating step2[i] +/- step2[31 - i]
+
+  out[0] = vqaddq_s16(step2[0], step2[31]);
+  out[1] = vqaddq_s16(step2[1], step2[30]);
+  out[2] = vqaddq_s16(step2[2], step2[29]);
+  out[3] = vqaddq_s16(step2[3], step2[28]);
+  out[4] = vqaddq_s16(step2[4], step2[27]);
+  out[5] = vqaddq_s16(step2[5], step2[26]);
+  out[6] = vqaddq_s16(step2[6], step2[25]);
+  out[7] = vqaddq_s16(step2[7], step2[24]);
+  out[8] = vqaddq_s16(step2[8], step2[23]);
+  out[9] = vqaddq_s16(step2[9], step2[22]);
+  out[10] = vqaddq_s16(step2[10], step2[21]);
+  out[11] = vqaddq_s16(step2[11], step2[20]);
+  out[12] = vqaddq_s16(step2[12], step2[19]);
+  out[13] = vqaddq_s16(step2[13], step2[18]);
+  out[14] = vqaddq_s16(step2[14], step2[17]);
+  out[15] = vqaddq_s16(step2[15], step2[16]);
+  out[16] = vqsubq_s16(step2[15], step2[16]);
+  out[17] = vqsubq_s16(step2[14], step2[17]);
+  out[18] = vqsubq_s16(step2[13], step2[18]);
+  out[19] = vqsubq_s16(step2[12], step2[19]);
+  out[20] = vqsubq_s16(step2[11], step2[20]);
+  out[21] = vqsubq_s16(step2[10], step2[21]);
+  out[22] = vqsubq_s16(step2[9], step2[22]);
+  out[23] = vqsubq_s16(step2[8], step2[23]);
+  out[24] = vqsubq_s16(step2[7], step2[24]);
+  out[25] = vqsubq_s16(step2[6], step2[25]);
+  out[26] = vqsubq_s16(step2[5], step2[26]);
+  out[27] = vqsubq_s16(step2[4], step2[27]);
+  out[28] = vqsubq_s16(step2[3], step2[28]);
+  out[29] = vqsubq_s16(step2[2], step2[29]);
+  out[30] = vqsubq_s16(step2[1], step2[30]);
+  out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;  // accepted but not used by this variant
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1;
+  int32x4_t t32[2];
+
+  // Only in[0] is read; one scaled vector is replicated to out[0..31].
+  // stages 1-5: widening multiply of in[0] by cospi[32], then a rounding
+  // shift right by INV_COS_BIT that narrows the result back to 16 bits.
+  // NOTE(review): the shift uses INV_COS_BIT while cospi is picked by cos_bit;
+  // presumably callers always pass cos_bit == INV_COS_BIT -- confirm.
+
+  t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                       vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  // stage 6
+  // stage 7
+  // stage 8
+  // stage 9: every output receives the same vector
+
+  out[0] = step1;
+  out[1] = step1;
+  out[2] = step1;
+  out[3] = step1;
+  out[4] = step1;
+  out[5] = step1;
+  out[6] = step1;
+  out[7] = step1;
+  out[8] = step1;
+  out[9] = step1;
+  out[10] = step1;
+  out[11] = step1;
+  out[12] = step1;
+  out[13] = step1;
+  out[14] = step1;
+  out[15] = step1;
+  out[16] = step1;
+  out[17] = step1;
+  out[18] = step1;
+  out[19] = step1;
+  out[20] = step1;
+  out[21] = step1;
+  out[22] = step1;
+  out[23] = step1;
+  out[24] = step1;
+  out[25] = step1;
+  out[26] = step1;
+  out[27] = step1;
+  out[28] = step1;
+  out[29] = step1;
+  out[30] = step1;
+  out[31] = step1;
+}
+
+static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[32], step2[32];
+  int32x4_t t32[16];
+  const int16x4_t c0 =
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c1 =
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // stage 1
+  // stage 2
+
+  step2[0] = in[0];
+  step2[4] = in[4];
+  step2[8] = in[2];
+  step2[12] = in[6];
+
+  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[4] = step2[4];
+
+  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+  step1[16] = step2[16];
+  step1[17] = step2[16];
+  step1[18] = step2[19];
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  step1[21] = step2[20];
+  step1[22] = step2[23];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[24];
+  step1[26] = step2[27];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+  step1[29] = step2[28];
+  step1[30] = step2[31];
+  step1[31] = step2[31];
+
+  // stage 4
+
+  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+                       &step2[18], &step2[29]);
+  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+                       &step2[22], &step2[25]);
+
+  step2[0] = step1[0];
+  step2[8] = step1[8];
+  step2[9] = step1[8];
+  step2[10] = step1[11];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[13] = step1[12];
+  step2[14] = step1[15];
+  step2[15] = step1[15];
+  step2[16] = step1[16];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[31] = step1[31];
+
+  // stage 5
+
+  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                          vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+                       &step1[10], &step1[13]);
+
+  step1[4] = step2[4];
+  step1[5] = step2[4];
+  step1[6] = step2[7];
+  step1[7] = step2[7];
+  step1[8] = step2[8];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[19]);
+  step1[17] = vqaddq_s16(step2[17], step2[18]);
+  step1[18] = vqsubq_s16(step2[17], step2[18]);
+  step1[19] = vqsubq_s16(step2[16], step2[19]);
+  step1[20] = vqsubq_s16(step2[23], step2[20]);
+  step1[21] = vqsubq_s16(step2[22], step2[21]);
+  step1[22] = vqaddq_s16(step2[22], step2[21]);
+  step1[23] = vqaddq_s16(step2[23], step2[20]);
+  step1[24] = vqaddq_s16(step2[24], step2[27]);
+  step1[25] = vqaddq_s16(step2[25], step2[26]);
+  step1[26] = vqsubq_s16(step2[25], step2[26]);
+  step1[27] = vqsubq_s16(step2[24], step2[27]);
+  step1[28] = vqsubq_s16(step2[31], step2[28]);
+  step1[29] = vqsubq_s16(step2[30], step2[29]);
+  step1[30] = vqaddq_s16(step2[30], step2[29]);
+  step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+  // stage 6
+
+  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+                       &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+                       &step2[21], &step2[26]);
+
+  step2[0] = step1[0];
+  step2[1] = step1[0];
+  step2[2] = step1[0];
+  step2[3] = step1[0];
+  step2[4] = step1[4];
+  step2[7] = step1[7];
+  step2[8] = vqaddq_s16(step1[8], step1[11]);
+  step2[9] = vqaddq_s16(step1[9], step1[10]);
+  step2[10] = vqsubq_s16(step1[9], step1[10]);
+  step2[11] = vqsubq_s16(step1[8], step1[11]);
+  step2[12] = vqsubq_s16(step1[15], step1[12]);
+  step2[13] = vqsubq_s16(step1[14], step1[13]);
+  step2[14] = vqaddq_s16(step1[14], step1[13]);
+  step2[15] = vqaddq_s16(step1[15], step1[12]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[22] = step1[22];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[25] = step1[25];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 7
+
+  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[7]);
+  step1[1] = vqaddq_s16(step2[1], step2[6]);
+  step1[2] = vqaddq_s16(step2[2], step2[5]);
+  step1[3] = vqaddq_s16(step2[3], step2[4]);
+  step1[4] = vqsubq_s16(step2[3], step2[4]);
+  step1[5] = vqsubq_s16(step2[2], step2[5]);
+  step1[6] = vqsubq_s16(step2[1], step2[6]);
+  step1[7] = vqsubq_s16(step2[0], step2[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[9];
+  step1[14] = step2[14];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[23]);
+  step1[17] = vqaddq_s16(step2[17], step2[22]);
+  step1[18] = vqaddq_s16(step2[18], step2[21]);
+  step1[19] = vqaddq_s16(step2[19], step2[20]);
+  step1[20] = vqsubq_s16(step2[19], step2[20]);
+  step1[21] = vqsubq_s16(step2[18], step2[21]);
+  step1[22] = vqsubq_s16(step2[17], step2[22]);
+  step1[23] = vqsubq_s16(step2[16], step2[23]);
+  step1[24] = vqsubq_s16(step2[31], step2[24]);
+  step1[25] = vqsubq_s16(step2[30], step2[25]);
+  step1[26] = vqsubq_s16(step2[29], step2[26]);
+  step1[27] = vqsubq_s16(step2[28], step2[27]);
+  step1[28] = vqaddq_s16(step2[27], step2[28]);
+  step1[29] = vqaddq_s16(step2[26], step2[29]);
+  step1[30] = vqaddq_s16(step2[25], step2[30]);
+  step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+  // stage 8
+
+  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[15]);
+  step2[1] = vqaddq_s16(step1[1], step1[14]);
+  step2[2] = vqaddq_s16(step1[2], step1[13]);
+  step2[3] = vqaddq_s16(step1[3], step1[12]);
+  step2[4] = vqaddq_s16(step1[4], step1[11]);
+  step2[5] = vqaddq_s16(step1[5], step1[10]);
+  step2[6] = vqaddq_s16(step1[6], step1[9]);
+  step2[7] = vqaddq_s16(step1[7], step1[8]);
+  step2[8] = vqsubq_s16(step1[7], step1[8]);
+  step2[9] = vqsubq_s16(step1[6], step1[9]);
+  step2[10] = vqsubq_s16(step1[5], step1[10]);
+  step2[11] = vqsubq_s16(step1[4], step1[11]);
+  step2[12] = vqsubq_s16(step1[3], step1[12]);
+  step2[13] = vqsubq_s16(step1[2], step1[13]);
+  step2[14] = vqsubq_s16(step1[1], step1[14]);
+  step2[15] = vqsubq_s16(step1[0], step1[15]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[18] = step1[18];
+  step2[19] = step1[19];
+  step2[28] = step1[28];
+  step2[29] = step1[29];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 9
+
+  out[0] = vqaddq_s16(step2[0], step2[31]);
+  out[1] = vqaddq_s16(step2[1], step2[30]);
+  out[2] = vqaddq_s16(step2[2], step2[29]);
+  out[3] = vqaddq_s16(step2[3], step2[28]);
+  out[4] = vqaddq_s16(step2[4], step2[27]);
+  out[5] = vqaddq_s16(step2[5], step2[26]);
+  out[6] = vqaddq_s16(step2[6], step2[25]);
+  out[7] = vqaddq_s16(step2[7], step2[24]);
+  out[8] = vqaddq_s16(step2[8], step2[23]);
+  out[9] = vqaddq_s16(step2[9], step2[22]);
+  out[10] = vqaddq_s16(step2[10], step2[21]);
+  out[11] = vqaddq_s16(step2[11], step2[20]);
+  out[12] = vqaddq_s16(step2[12], step2[19]);
+  out[13] = vqaddq_s16(step2[13], step2[18]);
+  out[14] = vqaddq_s16(step2[14], step2[17]);
+  out[15] = vqaddq_s16(step2[15], step2[16]);
+  out[16] = vqsubq_s16(step2[15], step2[16]);
+  out[17] = vqsubq_s16(step2[14], step2[17]);
+  out[18] = vqsubq_s16(step2[13], step2[18]);
+  out[19] = vqsubq_s16(step2[12], step2[19]);
+  out[20] = vqsubq_s16(step2[11], step2[20]);
+  out[21] = vqsubq_s16(step2[10], step2[21]);
+  out[22] = vqsubq_s16(step2[9], step2[22]);
+  out[23] = vqsubq_s16(step2[8], step2[23]);
+  out[24] = vqsubq_s16(step2[7], step2[24]);
+  out[25] = vqsubq_s16(step2[6], step2[25]);
+  out[26] = vqsubq_s16(step2[5], step2[26]);
+  out[27] = vqsubq_s16(step2[4], step2[27]);
+  out[28] = vqsubq_s16(step2[3], step2[28]);
+  out[29] = vqsubq_s16(step2[2], step2[29]);
+  out[30] = vqsubq_s16(step2[1], step2[30]);
+  out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
+                                         int8_t cos_bit, int bit) {  // reads in[0..15] only, writes out[0..31]
+  (void)bit;  // 'bit' is not used by this function
+  const int32_t *cospi = cospi_arr(cos_bit);  // constant table selected by cos_bit
+  int16x8_t step1[32], step2[32];  // ping-pong buffers: each stage reads one and writes the other
+  int32x4_t t32[16];  // only t32[0] and t32[1] are used (stage 5)
+  const int16x4_t c0 =  // NOTE(review): int32 table entries are passed as int16_t*; presumably the low half is read - confirm
+      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+  const int16x4_t c1 =  // constants for the btf_16_lane_* calls from stage 5 onward
+      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+  // stage 1: no arithmetic; stage 2 consumes in[] directly (in[16..31] never read)
+  // stage 2: odd in[1..15] -> step2[16..31]; even in[0..14] -> even step2[0..14]
+
+  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+  btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
+  btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
+  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+  btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
+  btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
+  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+  step2[0] = in[0];
+  step2[2] = in[8];
+  step2[4] = in[4];
+  step2[6] = in[12];
+  step2[8] = in[2];
+  step2[10] = in[10];
+  step2[12] = in[6];
+  step2[14] = in[14];
+
+  // stage 3: btf on step2[8,14,10,12] -> step1[8..15]; saturating +/- on 16..31
+
+  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+  btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
+  btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
+  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[4] = step2[4];
+  step1[6] = step2[6];
+  step1[16] = vqaddq_s16(step2[16], step2[17]);
+  step1[17] = vqsubq_s16(step2[16], step2[17]);
+  step1[18] = vqsubq_s16(step2[19], step2[18]);
+  step1[19] = vqaddq_s16(step2[19], step2[18]);
+  step1[20] = vqaddq_s16(step2[20], step2[21]);
+  step1[21] = vqsubq_s16(step2[20], step2[21]);
+  step1[22] = vqsubq_s16(step2[23], step2[22]);
+  step1[23] = vqaddq_s16(step2[23], step2[22]);
+  step1[24] = vqaddq_s16(step2[24], step2[25]);
+  step1[25] = vqsubq_s16(step2[24], step2[25]);
+  step1[26] = vqsubq_s16(step2[27], step2[26]);
+  step1[27] = vqaddq_s16(step2[27], step2[26]);
+  step1[28] = vqaddq_s16(step2[28], step2[29]);
+  step1[29] = vqsubq_s16(step2[28], step2[29]);
+  step1[30] = vqsubq_s16(step2[31], step2[30]);
+  step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+  // stage 4: btf -> step2[4..7] and pairs (17,30),(18,29),(21,26),(22,25); +/- on 8..15
+
+  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+  btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
+  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+                       &step2[18], &step2[29]);
+  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+                       &step2[22], &step2[25]);
+
+  step2[0] = step1[0];
+  step2[2] = step1[2];
+  step2[8] = vqaddq_s16(step1[8], step1[9]);
+  step2[9] = vqsubq_s16(step1[8], step1[9]);
+  step2[10] = vqsubq_s16(step1[11], step1[10]);
+  step2[11] = vqaddq_s16(step1[11], step1[10]);
+  step2[12] = vqaddq_s16(step1[12], step1[13]);
+  step2[13] = vqsubq_s16(step1[12], step1[13]);
+  step2[14] = vqsubq_s16(step1[15], step1[14]);
+  step2[15] = vqaddq_s16(step1[15], step1[14]);
+  step2[16] = step1[16];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[31] = step1[31];
+
+  // stage 5: step1[0] = rounding shift of step2[0] * cospi[32] by INV_COS_BIT (not cos_bit)
+
+  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+
+  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                          vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
+  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+                       &step1[10], &step1[13]);
+
+  step1[4] = vqaddq_s16(step2[4], step2[5]);
+  step1[5] = vqsubq_s16(step2[4], step2[5]);
+  step1[6] = vqsubq_s16(step2[7], step2[6]);
+  step1[7] = vqaddq_s16(step2[7], step2[6]);
+  step1[8] = step2[8];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[19]);
+  step1[17] = vqaddq_s16(step2[17], step2[18]);
+  step1[18] = vqsubq_s16(step2[17], step2[18]);
+  step1[19] = vqsubq_s16(step2[16], step2[19]);
+  step1[20] = vqsubq_s16(step2[23], step2[20]);
+  step1[21] = vqsubq_s16(step2[22], step2[21]);
+  step1[22] = vqaddq_s16(step2[22], step2[21]);
+  step1[23] = vqaddq_s16(step2[23], step2[20]);
+  step1[24] = vqaddq_s16(step2[24], step2[27]);
+  step1[25] = vqaddq_s16(step2[25], step2[26]);
+  step1[26] = vqsubq_s16(step2[25], step2[26]);
+  step1[27] = vqsubq_s16(step2[24], step2[27]);
+  step1[28] = vqsubq_s16(step2[31], step2[28]);
+  step1[29] = vqsubq_s16(step2[30], step2[29]);
+  step1[30] = vqaddq_s16(step2[30], step2[29]);
+  step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+  // stage 6: step2[0..3] = step1[0] +/- step1[3] or step1[2]; btf on (6,5) and 18..21/26..29
+
+  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+                       &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+                       &step2[21], &step2[26]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[3]);
+  step2[1] = vqaddq_s16(step1[0], step1[2]);
+  step2[2] = vqsubq_s16(step1[0], step1[2]);
+  step2[3] = vqsubq_s16(step1[0], step1[3]);
+  step2[4] = step1[4];
+  step2[7] = step1[7];
+  step2[8] = vqaddq_s16(step1[8], step1[11]);
+  step2[9] = vqaddq_s16(step1[9], step1[10]);
+  step2[10] = vqsubq_s16(step1[9], step1[10]);
+  step2[11] = vqsubq_s16(step1[8], step1[11]);
+  step2[12] = vqsubq_s16(step1[15], step1[12]);
+  step2[13] = vqsubq_s16(step1[14], step1[13]);
+  step2[14] = vqaddq_s16(step1[14], step1[13]);
+  step2[15] = vqaddq_s16(step1[15], step1[12]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[22] = step1[22];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[25] = step1[25];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 7: step1[i], step1[7 - i] = step2[i] +/- step2[7 - i]; similar folds on 16..23 and 24..31
+
+  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[7]);
+  step1[1] = vqaddq_s16(step2[1], step2[6]);
+  step1[2] = vqaddq_s16(step2[2], step2[5]);
+  step1[3] = vqaddq_s16(step2[3], step2[4]);
+  step1[4] = vqsubq_s16(step2[3], step2[4]);
+  step1[5] = vqsubq_s16(step2[2], step2[5]);
+  step1[6] = vqsubq_s16(step2[1], step2[6]);
+  step1[7] = vqsubq_s16(step2[0], step2[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[9];
+  step1[14] = step2[14];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[23]);
+  step1[17] = vqaddq_s16(step2[17], step2[22]);
+  step1[18] = vqaddq_s16(step2[18], step2[21]);
+  step1[19] = vqaddq_s16(step2[19], step2[20]);
+  step1[20] = vqsubq_s16(step2[19], step2[20]);
+  step1[21] = vqsubq_s16(step2[18], step2[21]);
+  step1[22] = vqsubq_s16(step2[17], step2[22]);
+  step1[23] = vqsubq_s16(step2[16], step2[23]);
+  step1[24] = vqsubq_s16(step2[31], step2[24]);
+  step1[25] = vqsubq_s16(step2[30], step2[25]);
+  step1[26] = vqsubq_s16(step2[29], step2[26]);
+  step1[27] = vqsubq_s16(step2[28], step2[27]);
+  step1[28] = vqaddq_s16(step2[27], step2[28]);
+  step1[29] = vqaddq_s16(step2[26], step2[29]);
+  step1[30] = vqaddq_s16(step2[25], step2[30]);
+  step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+  // stage 8: step2[i], step2[15 - i] = step1[i] +/- step1[15 - i] for i = 0..7; btf on 20..27
+
+  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[15]);
+  step2[1] = vqaddq_s16(step1[1], step1[14]);
+  step2[2] = vqaddq_s16(step1[2], step1[13]);
+  step2[3] = vqaddq_s16(step1[3], step1[12]);
+  step2[4] = vqaddq_s16(step1[4], step1[11]);
+  step2[5] = vqaddq_s16(step1[5], step1[10]);
+  step2[6] = vqaddq_s16(step1[6], step1[9]);
+  step2[7] = vqaddq_s16(step1[7], step1[8]);
+  step2[8] = vqsubq_s16(step1[7], step1[8]);
+  step2[9] = vqsubq_s16(step1[6], step1[9]);
+  step2[10] = vqsubq_s16(step1[5], step1[10]);
+  step2[11] = vqsubq_s16(step1[4], step1[11]);
+  step2[12] = vqsubq_s16(step1[3], step1[12]);
+  step2[13] = vqsubq_s16(step1[2], step1[13]);
+  step2[14] = vqsubq_s16(step1[1], step1[14]);
+  step2[15] = vqsubq_s16(step1[0], step1[15]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[18] = step1[18];
+  step2[19] = step1[19];
+  step2[28] = step1[28];
+  step2[29] = step1[29];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 9: out[i], out[31 - i] = step2[i] +/- step2[31 - i] for i = 0..15 (saturating)
+
+  out[0] = vqaddq_s16(step2[0], step2[31]);
+  out[1] = vqaddq_s16(step2[1], step2[30]);
+  out[2] = vqaddq_s16(step2[2], step2[29]);
+  out[3] = vqaddq_s16(step2[3], step2[28]);
+  out[4] = vqaddq_s16(step2[4], step2[27]);
+  out[5] = vqaddq_s16(step2[5], step2[26]);
+  out[6] = vqaddq_s16(step2[6], step2[25]);
+  out[7] = vqaddq_s16(step2[7], step2[24]);
+  out[8] = vqaddq_s16(step2[8], step2[23]);
+  out[9] = vqaddq_s16(step2[9], step2[22]);
+  out[10] = vqaddq_s16(step2[10], step2[21]);
+  out[11] = vqaddq_s16(step2[11], step2[20]);
+  out[12] = vqaddq_s16(step2[12], step2[19]);
+  out[13] = vqaddq_s16(step2[13], step2[18]);
+  out[14] = vqaddq_s16(step2[14], step2[17]);
+  out[15] = vqaddq_s16(step2[15], step2[16]);
+  out[16] = vqsubq_s16(step2[15], step2[16]);
+  out[17] = vqsubq_s16(step2[14], step2[17]);
+  out[18] = vqsubq_s16(step2[13], step2[18]);
+  out[19] = vqsubq_s16(step2[12], step2[19]);
+  out[20] = vqsubq_s16(step2[11], step2[20]);
+  out[21] = vqsubq_s16(step2[10], step2[21]);
+  out[22] = vqsubq_s16(step2[9], step2[22]);
+  out[23] = vqsubq_s16(step2[8], step2[23]);
+  out[24] = vqsubq_s16(step2[7], step2[24]);
+  out[25] = vqsubq_s16(step2[6], step2[25]);
+  out[26] = vqsubq_s16(step2[5], step2[26]);
+  out[27] = vqsubq_s16(step2[4], step2[27]);
+  out[28] = vqsubq_s16(step2[3], step2[28]);
+  out[29] = vqsubq_s16(step2[2], step2[29]);
+  out[30] = vqsubq_s16(step2[1], step2[30]);
+  out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
 // Functions for blocks with eob at DC and within
 // topleft 8x8, 16x16, 32x32 corner
 static const transform_1d_neon
@@ -90,10 +2112,37 @@ static const transform_1d_neon
         { NULL, NULL, NULL, NULL },
         { NULL, NULL, NULL, NULL } }
     };
-static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
-                                                  uint8_t *output, int stride,
-                                                  TX_TYPE tx_type,
-                                                  TX_SIZE tx_size, int eob) {
+
+static const transform_neon  // 1-D kernel table, looked up as [txw_idx or txh_idx][hitx/vitx_1d_tab[tx_type]][fun_idx]
+    lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {  // first size index: no kernels registered (all NULL)
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },  // idct8 kernels; last two slots unused
+        { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },  // iadst8 kernels
+        { identity8_new_neon, identity8_new_neon, NULL, NULL } },  // identity8: same kernel in both slots
+      {
+          { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },  // idct16: low1, low8, full
+          { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,  // iadst16: low1, low8, full
+            NULL },
+          { identity16_new_neon, identity16_new_neon, identity16_new_neon,  // identity16: same kernel in every used slot
+            NULL },
+      },
+      { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,  // idct32: low1, low8, low16, full
+          idct32_new_neon },
+        { NULL, NULL, NULL, NULL },  // no iadst32 kernel registered; callers assert on NULL
+        { identity32_new_neon, identity32_new_neon, identity32_new_neon,  // identity32: same kernel in every slot
+          identity32_new_neon } },
+      { { NULL, NULL, NULL, NULL },  // last size index: no kernels registered (all NULL)
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
   int32_t *temp_in = txfm_buf;
 
@@ -160,7 +2209,79 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
   }
 }
 
-static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+                                                  uint8_t *output, int stride,
+                                                  TX_TYPE tx_type,
+                                                  TX_SIZE tx_size, int eob) {  // row pass on a[], column pass on b[], then add to output
+  int16x8_t a[32 * 4];  // row-pass buffer: 128 vectors of 8 lanes
+  int16x8_t b[32 * 4];  // column-pass buffer, filled by transposing a[]
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);  // fills eobx/eoby from eob
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];  // shift[0] after rows, shift[1] after columns
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // w*h/8 vectors; 8x8 blocks not loaded below stay zero
+  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // bands with i >= buf_size_nonzero_h_div8 are never written below
+  const int buf_size_w_div8 = txfm_size_col >> 3;  // width in 8-column blocks
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to process, from eoby
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-column blocks to load, from eobx
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;  // offset into each column strip of b[]; grows by 8 per band
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);  // table holds NULL for unsupported combinations
+  assert(row_txfm != NULL);
+
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {  // i: 8-row band index
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;  // start of block (i, j) inside a[]
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);  // presumably narrows an 8x8 block to 16 bit - confirm
+      transpose_s16_8x8q(&a[k], &a[k]);  // in-place 8x8 transpose
+      input_1 += 8;  // next 8 columns
+    }
+    input += (txfm_size_col * 8);  // skip 8 * txfm_size_col coefficients to the next band
+    if (abs(rect_type) == 1) {  // extra rounding step only when the log ratio is +/-1
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);  // in place
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    for (int j = 0; j < buf_size_w_div8; ++j) {  // all column blocks, not just the loaded ones
+      int k = j * 8 + i * txfm_size_col;
+      transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);  // block j goes to column strip j of b[]
+    }
+    temp_b += 8;
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // one column pass per strip of txfm_size_row vectors
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+  if (txfm_size_col >= 16) {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {  // 16 output columns per call
+      lowbd_add_flip_buffer_16xn_neon(
+          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);  // flip argument fixed at 0
+    }
+  } else if (txfm_size_col == 8) {  // any other width writes nothing to output
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -244,7 +2365,88 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
   }
 }
 
-static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {  // row pass on a[], optional left-right flip, column pass on b[], add to output
+  int16x8_t a[16 * 2];  // row-pass buffer: 32 vectors of 8 lanes (256 coefficients)
+  int16x8_t b[16 * 2];  // column-pass buffer, filled by transposing a[]
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);  // fills eobx/eoby from eob
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];  // shift[0] after rows, shift[1] after columns
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // NOTE(review): only b[] is zeroed; presumably row_txfm ignores unloaded a[] blocks - confirm
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_w_div8 = txfm_size_col >> 3;  // width in 8-column blocks
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to process, from eoby
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-column blocks to load, from eobx
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;  // offset into each column strip of b[]; grows by 8 per band
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);  // table holds NULL for unsupported combinations
+  assert(row_txfm != NULL);
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);  // ud_flip is fetched but the output calls below pass 0
+
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {  // i: 8-row band index
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;  // start of block (i, j) inside a[]
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);  // presumably narrows an 8x8 block to 16 bit - confirm
+      transpose_s16_8x8q(&a[k], &a[k]);  // in-place 8x8 transpose
+      input_1 += 8;  // next 8 columns
+    }
+    input += (txfm_size_col * 8);  // skip 8 * txfm_size_col coefficients to the next band
+    if (abs(rect_type) == 1) {  // extra rounding step only when the log ratio is +/-1
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);  // in place
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    if (lr_flip == 1) {  // mirrored path: reverse each block and the block order
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        flip_buf_ud_neon(&a[k], 8);  // reverse the 8 vectors of this block before transposing
+        transpose_s16_8x8q(
+            &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);  // block j lands in the mirrored column strip
+      }
+      temp_b += 8;
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);  // block j goes to column strip j of b[]
+      }
+      temp_b += 8;
+    }
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // one column pass per strip of txfm_size_row vectors
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+  if (txfm_size_col >= 16) {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {  // 16 output columns per call
+      lowbd_add_flip_buffer_16xn_neon(
+          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);  // flip argument fixed at 0
+    }
+  } else if (txfm_size_col == 8) {  // any other width writes nothing to output
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -328,6 +2530,78 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
   }
 }
 
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {  // row pass on a[], column pass on b[], add to output with ud_flip
+  int16x8_t a[16 * 2];  // row-pass buffer: 32 vectors of 8 lanes (256 coefficients)
+  int16x8_t b[16 * 2];  // column-pass buffer, filled by transposing a[]
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);  // fills eobx/eoby from eob
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];  // shift[0] after rows, shift[1] after columns
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
+                               0);  // NOTE(review): only a[] is zeroed; presumably col_txfm ignores unwritten b[] bands - confirm
+  const int buf_size_w_div8 = txfm_size_col >> 3;  // width in 8-column blocks
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to process, from eoby
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-column blocks to load, from eobx
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;  // offset into each column strip of b[]; grows by 8 per band
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);  // table holds NULL for unsupported combinations
+  assert(row_txfm != NULL);
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);  // lr_flip is fetched but not used in this function
+
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {  // i: 8-row band index
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;  // start of block (i, j) inside a[]
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);  // presumably narrows an 8x8 block to 16 bit - confirm
+      transpose_s16_8x8q(&a[k], &a[k]);  // in-place 8x8 transpose
+      input_1 += 8;  // next 8 columns
+    }
+    input += (txfm_size_col * 8);  // skip 8 * txfm_size_col coefficients to the next band
+    if (abs(rect_type) == 1) {  // extra rounding step only when the log ratio is +/-1
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);  // in place
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    for (int j = 0; j < buf_size_w_div8; ++j) {  // all column blocks, not just the loaded ones
+      int k = j * 8 + i * txfm_size_col;
+      transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);  // block j goes to column strip j of b[]
+    }
+    temp_b += 8;
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // one column pass per strip of txfm_size_row vectors
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+  if (txfm_size_col >= 16) {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {  // 16 output columns per call
+      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+                                      output + 16 * i, stride, ud_flip,
+                                      txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {  // any other width writes nothing to output
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+  }
+}
+
 static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_TYPE tx_type,
@@ -644,7 +2918,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
   }
 }
 
-static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
   DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
@@ -727,6 +3001,118 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
   }
 }
 
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {  // row pass on a[], optional left-right flip, column pass on b[], add with ud_flip
+  int16x8_t a[64 * 8];  // row-pass buffer: 512 vectors of 8 lanes; NOTE(review): not zeroed here - confirm kernels skip unloaded blocks
+  int16x8_t b[64 * 8];  // column-pass buffer, filled by transposing a[]; also not zeroed
+  int eobx, eoby, ud_flip, lr_flip;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);  // fills eobx/eoby from eob
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];  // shift[0] after rows, shift[1] after columns
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int buf_size_w_div8 = txfm_size_col >> 3;  // width in 8-column blocks
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to process, from eoby
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-column blocks to load, from eobx
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int32_t *input_1;
+  int temp_b = 0;  // offset into each column strip of b[]; grows by 8 per band
+
+  const transform_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);  // table holds NULL for unsupported combinations
+  assert(row_txfm != NULL);
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);  // lr_flip is applied during the transpose, ud_flip at output
+
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {  // i: 8-row band index
+    input_1 = input;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      int k = j * 8 + i * txfm_size_col;  // start of block (i, j) inside a[]
+      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);  // presumably narrows an 8x8 block to 16 bit - confirm
+      transpose_s16_8x8q(&a[k], &a[k]);  // in-place 8x8 transpose
+      input_1 += 8;  // next 8 columns
+    }
+    input += (txfm_size_col * 8);  // skip 8 * txfm_size_col coefficients to the next band
+    if (abs(rect_type) == 1) {  // extra rounding step only when the log ratio is +/-1
+      int y = i * txfm_size_col;
+      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+    }
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);  // in place
+    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+                                  -shift[0]);
+    if (lr_flip == 1) {  // mirrored path: reverse each block and the block order
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        flip_buf_ud_neon(&a[k], 8);  // reverse the 8 vectors of this block before transposing
+        transpose_s16_8x8q(
+            &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);  // block j lands in the mirrored column strip
+      }
+      temp_b += 8;
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        int k = j * 8 + i * txfm_size_col;
+        transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);  // block j goes to column strip j of b[]
+      }
+      temp_b += 8;
+    }
+  }
+  for (int j = 0; j < buf_size_w_div8; ++j) {  // one column pass per strip of txfm_size_row vectors
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+                                  -shift[1]);
+  }
+
+  if (txfm_size_col >= 16) {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {  // 16 output columns per call
+      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+                                      output + 16 * i, stride, ud_flip,
+                                      txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {  // any other width writes nothing to output
+    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {  // forwards all arguments unchanged to one *_wxh_* routine chosen by tx_type
+  switch (tx_type) {
+    case IDTX:  // handled by the wxh idtx routine
+      lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+      break;
+
+    case H_DCT:  // the three H_* types share the v_wxh_identity routine
+    case H_ADST:
+    case H_FLIPADST:
+      lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type,
+                                               tx_size, eob);
+      break;
+
+    case V_DCT:  // the three V_* types share the h_wxh_identity routine
+    case V_ADST:
+    case V_FLIPADST:
+      lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type,
+                                               tx_size, eob);
+      break;
+
+    default:  // every remaining tx_type
+      lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type,
+                                                tx_size, eob);
+      break;
+  }
+}
+
 static INLINE void lowbd_inv_txfm2d_add_universe_neon(
     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
     TX_SIZE tx_size, int eob) {
@@ -756,6 +3142,7 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon(
       break;
   }
 }
+
 void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                    int eob) {
@@ -787,8 +3174,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
       break;
 
     case TX_16X64: {
-      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_64X16: {
@@ -797,13 +3184,13 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
       }
-      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_32X64: {
-      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_64X32: {
@@ -812,8 +3199,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
       }
-      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     case TX_64X64: {
@@ -822,8 +3209,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
       }
-      lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
-                                         tx_size, eob);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+                                             tx_size, eob);
     } break;
 
     default:
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
index 6af2d61e7..9ec658291 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
-#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
@@ -23,6 +23,8 @@
 typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
                                   const int8_t cos_bit,
                                   const int8_t *stage_ptr);
+typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
+                               int8_t cos_bit, int bit);
 
 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
@@ -149,4 +151,4 @@ static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
   *eoby = eob_fill[temp_eoby];
 }
 
-#endif  // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
index 0d8233744..7134f183e 100644
--- a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -34,8 +34,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
   uint8x8_t tmp0, tmp1;
   uint8x16_t res_q;
   uint16x8_t res, res_low, res_high;
-  uint32x2_t tmp0_32, tmp1_32;
-  uint16x4_t tmp0_16, tmp1_16;
+  uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+  uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
   const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
 
   if (w >= 16) {
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
index 33b06b767..194e94c8c 100644
--- a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -27,8 +27,8 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
   uint8x8_t tmp0, tmp1;
   uint8x16_t tmp0_q, tmp1_q, res_q;
   uint16x8_t res, res_low, res_high;
-  uint32x2_t tmp0_32, tmp1_32;
-  uint16x4_t tmp0_16, tmp1_16;
+  uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+  uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
index d731b6a66..39025b5e5 100644
--- a/third_party/aom/av1/common/arm/cfl_neon.c
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -131,7 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
 }
 
-#if __ARM_ARCH <= 7
+#ifndef __aarch64__
 uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
                       vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
@@ -311,7 +311,7 @@ static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
 
   // Permute and add in such a way that each lane contains the block sum.
   // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
-#if __ARM_ARCH >= 8
+#ifdef __aarch64__
   sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
   sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
 #else
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
index f15744c94..d0c4f8ff6 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -13,6 +13,8 @@
 #include <assert.h>
 #include <arm_neon.h>
 
+#include "config/av1_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
@@ -68,6 +70,33 @@ static INLINE uint8x8_t convolve8_horiz_8x8(
   return vqmovun_s16(sum);
 }
 
+#if !defined(__aarch64__)
+static INLINE uint8x8_t convolve8_horiz_4x1(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+    const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+  int16x4_t sum;  // 8-tap sum of s0..s7 weighted by filter[0..7], 4 lanes
+
+  sum = vmul_n_s16(s0, filter[0]);
+  sum = vmla_n_s16(sum, s1, filter[1]);
+  sum = vmla_n_s16(sum, s2, filter[2]);
+  sum = vmla_n_s16(sum, s5, filter[5]);
+  sum = vmla_n_s16(sum, s6, filter[6]);
+  sum = vmla_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128, so 128 * 255 plus the running sum
+   * can exceed 16 bits; add the centre taps last using saturating adds.
+   */
+  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+
+  sum = vqrshl_s16(sum, shift_round_0);
+  sum = vqrshl_s16(sum, shift_by_bits);  // x_sr caller passes -bits here
+
+  return vqmovun_s16(vcombine_s16(sum, sum));  // 4 results, in both halves
+}
+#endif  // !defined(__aarch64__)
+
 static INLINE uint8x8_t convolve8_vert_8x4(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
@@ -175,7 +204,10 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   (void)conv_params;
   (void)filter_params_y;
 
-  uint8x8_t t0, t1, t2, t3;
+  uint8x8_t t0;
+#if defined(__aarch64__)
+  uint8x8_t t1, t2, t3;
+#endif
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -188,7 +220,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
 
   src -= horiz_offset;
-
+#if defined(__aarch64__)
   if (h == 4) {
     uint8x8_t d01, d23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -275,12 +307,18 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       w -= 4;
     } while (w > 0);
   } else {
+#endif
     int width;
     const uint8_t *s;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10;
     uint8x8_t t4, t5, t6, t7;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+#endif
 
     if (w <= 4) {
+#if defined(__aarch64__)
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -387,10 +425,49 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         }
         h -= 8;
       } while (h > 0);
+#else
+    int16x8_t tt0;
+    int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+    const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
+    const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
+    do {
+      t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
+      x4 = vget_high_s16(tt0);  // a4 a5 a6 a7
+
+      t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      x7 = vget_low_s16(tt0);  // a8 a9 a10 a11
+
+      x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
+      x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
+      x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
+      x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
+      x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
+      x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10
+
+      src += src_stride;
+
+      t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
+                               shift_round_0_low, shift_by_bits_low);
+
+      if (w == 4) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                      0);  // 00 01 02 03
+        dst += dst_stride;
+      } else if (w == 2) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+        dst += dst_stride;
+      }
+      h -= 1;
+    } while (h > 0);
+#endif
     } else {
       uint8_t *d;
-      int16x8_t s11, s12, s13, s14;
-
+      int16x8_t s11;
+#if defined(__aarch64__)
+      int16x8_t s12, s13, s14;
       do {
         __builtin_prefetch(src + 0 * src_stride);
         __builtin_prefetch(src + 1 * src_stride);
@@ -479,8 +556,47 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         dst += 8 * dst_stride;
         h -= 8;
       } while (h > 0);
+#else
+    do {
+      t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+      width = w;
+      s = src + 8;
+      d = dst;
+      __builtin_prefetch(dst);
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s11 = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                 shift_round_0, shift_by_bits);
+        vst1_u8(d, t0);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 1;
+    } while (h > 0);
+#endif
     }
+#if defined(__aarch64__)
   }
+#endif
 }
 
 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -505,9 +621,12 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
 
   if (w <= 4) {
-    uint8x8_t d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+    uint8x8_t d01;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+    uint8x8_t d23;
+    int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
     s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
     src += src_stride;
     s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -526,6 +645,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
     do {
       s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
       src += src_stride;
+#if defined(__aarch64__)
       s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
       src += src_stride;
       s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -591,14 +711,41 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       s5 = s9;
       s6 = s10;
       h -= 4;
+#else
+      __builtin_prefetch(dst + 0 * dst_stride);
+      __builtin_prefetch(src + 0 * src_stride);
+
+      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
+
+      if (w == 4) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+        dst += dst_stride;
+      } else if (w == 2) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
+        dst += dst_stride;
+      }
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      h -= 1;
+#endif
     } while (h > 0);
   } else {
     int height;
     const uint8_t *s;
     uint8_t *d;
-    uint8x8_t t0, t1, t2, t3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
+    uint8x8_t t0;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+#if defined(__aarch64__)
+    uint8x8_t t1, t2, t3;
+    int16x8_t s8, s9, s10;
+#endif
     do {
       __builtin_prefetch(src + 0 * src_stride);
       __builtin_prefetch(src + 1 * src_stride);
@@ -628,6 +775,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       do {
         s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
         s += src_stride;
+#if defined(__aarch64__)
         s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
         s += src_stride;
         s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
@@ -670,6 +818,24 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         s5 = s9;
         s6 = s10;
         height -= 4;
+#else
+        __builtin_prefetch(d);
+        __builtin_prefetch(s);
+
+        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+        vst1_u8(d, t0);
+        d += dst_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        height -= 1;
+#endif
       } while (height > 0);
       src += 8;
       dst += 8;
@@ -686,7 +852,10 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                              ConvolveParams *conv_params) {
   int im_dst_stride;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+  uint8x8_t t0;
+#if defined(__aarch64__)
+  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
 
   DECLARE_ALIGNED(16, int16_t,
                   im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
@@ -724,13 +893,18 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   assert(conv_params->round_0 > 0);
 
   if (w <= 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
 
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
 
     do {
       s = src_ptr;
+
+#if defined(__aarch64__)
       __builtin_prefetch(s + 0 * src_stride);
       __builtin_prefetch(s + 1 * src_stride);
       __builtin_prefetch(s + 2 * src_stride);
@@ -789,16 +963,56 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       src_ptr += 4 * src_stride;
       dst_ptr += 4 * im_dst_stride;
       height -= 4;
+#else
+      int16x8_t tt0;
+
+      __builtin_prefetch(s);
+
+      t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s0 = vget_low_s16(tt0);
+      s4 = vget_high_s16(tt0);
+
+      __builtin_prefetch(dst_ptr);
+      s += 8;
+
+      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+      s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
+      s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
+      s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
+
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                             horiz_const, shift_round_0);
+
+      if (w == 4) {
+        vst1_s16(dst_ptr, d0);
+        dst_ptr += im_dst_stride;
+      } else if (w == 2) {
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+        dst_ptr += im_dst_stride;
+      }
+
+      src_ptr += src_stride;
+      height -= 1;
+#endif
     } while (height > 0);
   } else {
     int16_t *d_tmp;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
     int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+#endif
 
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
 
+#if defined(__aarch64__)
     do {
       __builtin_prefetch(src_ptr + 0 * src_stride);
       __builtin_prefetch(src_ptr + 1 * src_stride);
@@ -886,6 +1100,45 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       dst_ptr += 8 * im_dst_stride;
       height -= 8;
     } while (height > 0);
+#else
+    do {
+      t0 = vld1_u8(src_ptr);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      width = w;
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t sum = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+
+        vst1q_s16(d_tmp, res0);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += im_dst_stride;
+      height -= 1;
+    } while (height > 0);
+#endif
   }
 
   // vertical
@@ -910,10 +1163,17 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
     width = w;
 
     if (width <= 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x4_t d0, d1, d2, d3;
-      uint16x8_t dd0, dd1;
-      uint8x8_t d01, d23;
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+      uint16x4_t d0;
+      uint16x8_t dd0;
+      uint8x8_t d01;
+
+#if defined(__aarch64__)
+      int16x4_t s8, s9, s10;
+      uint16x4_t d1, d2, d3;
+      uint16x8_t dd1;
+      uint8x8_t d23;
+#endif
 
       d_u8 = dst_u8_ptr;
       v_s = v_src_ptr;
@@ -931,6 +1191,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
       v_s += (7 * im_stride);
 
       do {
+#if defined(__aarch64__)
         load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
         v_s += (im_stride << 2);
 
@@ -1008,11 +1269,48 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         s5 = s9;
         s6 = s10;
         height -= 4;
+#else
+        s7 = vld1_s16(v_s);
+        v_s += im_stride;
+
+        __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+
+        dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+        d01 = vqmovn_u16(dd0);
+
+        if (w == 4) {
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        0);  // 00 01 02 03
+          d_u8 += dst_stride;
+
+        } else if (w == 2) {
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        0);  // 00 01
+          d_u8 += dst_stride;
+        }
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        height -= 1;
+#endif
       } while (height > 0);
     } else {
       // if width is a multiple of 8 & height is a multiple of 4
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint8x8_t res0, res1, res2, res3;
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+      uint8x8_t res0;
+#if defined(__aarch64__)
+      int16x8_t s8, s9, s10;
+      uint8x8_t res1, res2, res3;
+#endif
 
       do {
         __builtin_prefetch(v_src_ptr + 0 * im_stride);
@@ -1032,6 +1330,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
         height = h;
 
         do {
+#if defined(__aarch64__)
           load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
           v_s += (im_stride << 2);
 
@@ -1076,6 +1375,28 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
           s5 = s9;
           s6 = s10;
           height -= 4;
+#else
+          s7 = vld1q_s16(v_s);
+          v_s += im_stride;
+
+          __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+
+          vst1_u8(d_u8, res0);
+          d_u8 += dst_stride;
+
+          s0 = s1;
+          s1 = s2;
+          s2 = s3;
+          s3 = s4;
+          s4 = s5;
+          s5 = s6;
+          s6 = s7;
+          height -= 1;
+#endif
         } while (height > 0);
         v_src_ptr += 8;
         dst_u8_ptr += 8;
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
index 47c93d645..f382984f2 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.h
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_
-#define AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
 
 #include <arm_neon.h>
 
@@ -225,4 +225,4 @@ static INLINE uint16x4_t convolve8_4x4_s32(
   return res;
 }
 
-#endif  // AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
index 4015082b4..e5674ef7c 100644
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -22,12 +22,108 @@
 #include "av1/common/arm/mem_neon.h"
 #include "av1/common/arm/transpose_neon.h"
 
+#if !defined(__aarch64__)
+static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
+                                   const uint16_t fwd_offset,
+                                   const uint16_t bck_offset,
+                                   const int16x4_t sub_const_vec,
+                                   const int16_t round_bits,
+                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0;
+  uint16x4_t tmp_u0;
+  uint32x4_t sum0;
+  int32x4_t dst0;
+  int16x8_t tmp4;
+  // Blends the 4 lanes of res0 and d0, then narrows the result into *t0.
+  if (use_jnt_comp_avg) {
+    const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+    // Weighted sum res0 * fwd_offset + d0 * bck_offset in 32-bit lanes.
+    sum0 = vmull_n_u16(res0, fwd_offset);
+    sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec));
+    // Negated count makes vqrshlq a rounding right shift by round_bits.
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp4 = vcombine_s16(tmp0, tmp0);
+    // The same four values fill both halves of the 8-byte output.
+    *t0 = vqmovun_s16(tmp4);
+  } else {
+    const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+    tmp_u0 = vhadd_u16(res0, d0);  // halving add: (res0 + d0) >> 1
+
+    tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+
+    tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+
+    tmp4 = vcombine_s16(tmp0, tmp0);
+
+    *t0 = vqmovun_s16(tmp4);
+  }
+}
+
+static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
+                                   const uint16_t fwd_offset,
+                                   const uint16_t bck_offset,
+                                   const int16x4_t sub_const,
+                                   const int16_t round_bits,
+                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0, tmp2;
+  int16x8_t f0;
+  uint32x4_t sum0, sum2;
+  int32x4_t dst0, dst2;
+
+  uint16x8_t tmp_u0;
+  // Blends the 8 lanes of res0 and d0, then narrows the result into *t0.
+  if (use_jnt_comp_avg) {
+    const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+    const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+    // Low four lanes: weighted sum in 32 bits, then >> DIST_PRECISION_BITS.
+    sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+    sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+    // High four lanes: same computation.
+    sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+    sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+    sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+    dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+    // Negated count makes vqrshlq a rounding right shift by round_bits.
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+    dst2 = vqrshlq_s32(dst2, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp2 = vqmovn_s32(dst2);
+
+    f0 = vcombine_s16(tmp0, tmp2);
+
+    *t0 = vqmovun_s16(f0);
+
+  } else {
+    const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+    const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+    // Halving add: (res0 + d0) >> 1 per lane.
+    tmp_u0 = vhaddq_u16(res0, d0);
+
+    f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+
+    f0 = vqrshlq_s16(f0, round_bits_vec);
+
+    *t0 = vqmovun_s16(f0);
+  }
+}
+#endif  // !defined(__aarch64__)
+
 static INLINE void compute_avg_4x4(
     uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
     uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
     const uint16_t fwd_offset, const uint16_t bck_offset,
     const int16x4_t sub_const_vec, const int16_t round_bits,
-    const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+    const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
   int16x4_t tmp0, tmp1, tmp2, tmp3;
   uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
   uint32x4_t sum0, sum1, sum2, sum3;
@@ -107,7 +203,7 @@ static INLINE void compute_avg_8x4(
     uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
     const uint16_t fwd_offset, const uint16_t bck_offset,
     const int16x4_t sub_const, const int16_t round_bits,
-    const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
+    const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
     uint8x8_t *t3) {
   int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int16x8_t f0, f1, f2, f3;
@@ -231,7 +327,6 @@ static INLINE void jnt_convolve_2d_horiz_neon(
   int16_t *dst_ptr;
   int dst_stride;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
 
   dst_ptr = im_block;
   dst_stride = im_stride;
@@ -239,15 +334,22 @@ static INLINE void jnt_convolve_2d_horiz_neon(
   width = w;
 
   if (w == 4) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    int16x8_t tt0;
+    uint8x8_t t0;
 
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
 
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    int16x8_t tt1, tt2, tt3;
+    uint8x8_t t1, t2, t3;
+#endif
     do {
       s = src;
       __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
       __builtin_prefetch(s + 1 * src_stride);
       __builtin_prefetch(s + 2 * src_stride);
       __builtin_prefetch(s + 3 * src_stride);
@@ -301,17 +403,48 @@ static INLINE void jnt_convolve_2d_horiz_neon(
       src += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
       height -= 4;
+#else
+      t0 = vld1_u8(s);                            // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+      s0 = vget_low_s16(tt0);                     // a0 a1 a2 a3
+      s4 = vget_high_s16(tt0);                    // a4 a5 a6 a7
+      __builtin_prefetch(dst_ptr);
+      s += 8;
+      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+
+      // a8 a9 a10 a11
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+      s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+      s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+      s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+      s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
+      s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
+      s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
+
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                             horiz_const, shift_round_0);
+
+      vst1_s16(dst_ptr, d0);
+
+      src += src_stride;
+      dst_ptr += dst_stride;
+      height -= 1;
+#endif
     } while (height > 0);
   } else {
     int16_t *d_tmp;
-    int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    int16x8_t res0;
+    uint8x8_t t0;
 
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
-
     do {
+#if defined(__aarch64__)
+      uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+      int16x8_t s8, s9, s10, s11, s12, s13, s14;
+      int16x8_t res1, res2, res3, res4, res5, res6, res7;
       __builtin_prefetch(src + 0 * src_stride);
       __builtin_prefetch(src + 1 * src_stride);
       __builtin_prefetch(src + 2 * src_stride);
@@ -390,6 +523,42 @@ static INLINE void jnt_convolve_2d_horiz_neon(
       src += 8 * src_stride;
       dst_ptr += 8 * dst_stride;
       height -= 8;
+#else
+      int16x8_t temp_0;
+      t0 = vld1_u8(src);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      width = w;
+      s = src + 8;
+      d_tmp = dst_ptr;
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        temp_0 = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(temp_0, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(temp_0, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(temp_0, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(temp_0, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(temp_0, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(temp_0, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(temp_0, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+        vst1q_s16(d_tmp, res0);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src += src_stride;
+      dst_ptr += dst_stride;
+      height -= 1;
+#endif
     } while (height > 0);
   }
 }
@@ -420,10 +589,15 @@ static INLINE void jnt_convolve_2d_vert_neon(
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
 
-  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  uint16x4_t res4, res5, res6, res7;
-  uint16x4_t d0, d1, d2, d3;
-  uint8x8_t t0, t1;
+  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+  uint16x4_t res4, d0;
+  uint8x8_t t0;
+
+#if defined(__aarch64__)
+  int16x4_t s8, s9, s10;
+  uint16x4_t res5, res6, res7, d1, d2, d3;
+  uint8x8_t t1;
+#endif
 
   dst = conv_params->dst;
   src_ptr = im_block;
@@ -450,6 +624,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
     s += (7 * im_stride);
 
     do {
+#if defined(__aarch64__)
       load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10);
       s += (im_stride << 2);
 
@@ -480,17 +655,13 @@ static INLINE void jnt_convolve_2d_vert_neon(
                         bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
                         &t0, &t1);
 
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                      0);  // 00 01 02 03
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
         d_u8 += dst8_stride;
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                      1);  // 10 11 12 13
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
         d_u8 += dst8_stride;
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                      0);  // 20 21 22 23
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
         d_u8 += dst8_stride;
-        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                      1);  // 30 31 32 33
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
         d_u8 += dst8_stride;
 
       } else {
@@ -505,6 +676,39 @@ static INLINE void jnt_convolve_2d_vert_neon(
       s5 = s9;
       s6 = s10;
       height -= 4;
+#else
+      s7 = vld1_s16(s);
+      s += (im_stride);
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d_u8 + 0 * dst8_stride);
+
+      d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                             round_shift_vec, offset_const);
+
+      if (do_average) {
+        res4 = vld1_u16(d);
+        d += (dst_stride);
+
+        compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
+                        round_bits, use_jnt_comp_avg, &t0);
+
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+        d_u8 += dst8_stride;
+
+      } else {
+        vst1_u16(d, d0);
+        d += (dst_stride);
+      }
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+      height--;
+#endif
     } while (height > 0);
     src_ptr += 4;
     dst_ptr += 4;
@@ -722,8 +926,10 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   uint8_t *dst_u8_ptr;
   CONV_BUF_TYPE *d, *dst_ptr;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
+  uint8x8_t t0;
+#if defined(__aarch64__)
+  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
   s = src_ptr;
   dst_ptr = dst;
   dst_u8_ptr = dst8;
@@ -731,11 +937,18 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   height = h;
 
   if ((w == 4) || (h == 4)) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
-    uint16x4_t res4, res5, res6, res7;
-    uint32x2_t tu0, tu1;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    int16x8_t tt0;
+    uint16x4_t res4;
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    int16x8_t tt1, tt2, tt3;
+    uint16x4_t res5, res6, res7;
+    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
     int16x8_t u0, u1;
+#else
+    int16x4_t temp_0;
+#endif
     const int16x4_t zero = vdup_n_s16(0);
     const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
     const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1);
@@ -746,6 +959,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       d_u8 = dst_u8_ptr;
       width = w;
       __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
       __builtin_prefetch(s + 1 * src_stride);
       __builtin_prefetch(s + 2 * src_stride);
       __builtin_prefetch(s + 3 * src_stride);
@@ -854,15 +1068,66 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       dst_ptr += (dst_stride << 2);
       dst_u8_ptr += (dst8_stride << 2);
       height -= 4;
+#else
+      t0 = vld1_u8(s);                            // a0 a1 a2 a3 a4 a5 a6 a7
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+      s0 = vget_low_s16(tt0);                     // a0 a1 a2 a3
+      s4 = vget_high_s16(tt0);                    // a4 a5 a6 a7
+      __builtin_prefetch(d);
+
+      s += 8;
+      do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11
+
+        // a8 a9 a10 a11
+        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        temp_0 = s7;
+        s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
+        s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
+        s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
+        s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
+        s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
+        s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                               zero, shift_round_0);
+        d0 = vrshl_s16(d0, horiz_const);
+        d0 = vadd_s16(d0, round_offset_vec);
+        s0 = s4;
+        s4 = temp_0;
+        if (conv_params->do_average) {
+          __builtin_prefetch(d);
+          __builtin_prefetch(d_u8);
+
+          res4 = vld1_u16(d);
+
+          compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+                          bck_offset, round_offset_vec, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+        } else {
+          vst1_u16(d, vreinterpret_u16_s16(d0));
+        }
+
+        s += 4;
+        width -= 4;
+        d += 4;
+        d_u8 += 4;
+      } while (width > 0);
+      src_ptr += (src_stride);
+      dst_ptr += (dst_stride);
+      dst_u8_ptr += (dst8_stride);
+      height--;
+#endif
     } while (height > 0);
   } else {
     CONV_BUF_TYPE *d_tmp;
     uint8_t *d_u8_tmp;
-    int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t res8, res9, res10, res11;
-
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    int16x8_t res0;
+    uint16x8_t res8;
     const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
     const int16x4_t round_offset64 = vdup_n_s16(round_offset);
     const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
@@ -872,6 +1137,11 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
     d = dst_ptr = dst;
     d_u8 = dst_u8_ptr = dst8;
     do {
+#if defined(__aarch64__)
+      int16x8_t s11, s12, s13, s14;
+      int16x8_t s8, s9, s10;
+      int16x8_t res1, res2, res3, res4, res5, res6, res7;
+      uint16x8_t res9, res10, res11;
       __builtin_prefetch(src_ptr + 0 * src_stride);
       __builtin_prefetch(src_ptr + 1 * src_stride);
       __builtin_prefetch(src_ptr + 2 * src_stride);
@@ -1007,6 +1277,67 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       dst_ptr += 8 * dst_stride;
       dst_u8_ptr += 8 * dst8_stride;
       height -= 8;
+#else
+      int16x8_t temp_0;
+      __builtin_prefetch(src_ptr);
+      t0 = vld1_u8(src_ptr);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      width = w;
+      s = src_ptr + 8;
+      d = dst_ptr;
+      d_u8_tmp = dst_u8_ptr;
+
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        d_u8 = d_u8_tmp;
+        d_tmp = d;
+
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        temp_0 = s0;
+        s0 = s7;
+
+        s1 = vextq_s16(temp_0, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        s2 = vextq_s16(temp_0, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        s3 = vextq_s16(temp_0, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        s4 = vextq_s16(temp_0, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        s5 = vextq_s16(temp_0, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        s6 = vextq_s16(temp_0, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        s7 = vextq_s16(temp_0, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+                                 x_filter_tmp, zero, shift_round_0);
+
+        res0 = vrshlq_s16(res0, horiz_const);
+        res0 = vaddq_s16(res0, round_offset128);
+
+        if (conv_params->do_average) {
+          res8 = vld1q_u16(d_tmp);
+          d_tmp += (dst_stride);
+
+          compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+                          bck_offset, round_offset64, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_u8(d_u8, t0);
+          d_u8 += (dst8_stride);
+        } else {
+          vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+          d_tmp += (dst_stride);
+        }
+
+        s += 8;
+        d += 8;
+        width -= 8;
+        d_u8_tmp += 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      dst_u8_ptr += dst8_stride;
+      height--;
+#endif
     } while (height > 0);
   }
 }
@@ -1057,7 +1388,6 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   uint8_t *dst_u8_ptr;
   CONV_BUF_TYPE *d, *dst_ptr;
   int width, height;
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
 
   s = src_ptr;
   dst_ptr = dst;
@@ -1070,11 +1400,18 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
   assert((conv_params->round_1 - 2) >= bits);
 
   if ((w == 4) || (h == 4)) {
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    uint16x4_t res4, res5, res6, res7;
-    uint32x2_t tu0, tu1, tu2, tu3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    uint16x4_t res4;
+    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+               tu3 = vdup_n_u32(0);
     int16x8_t u0, u1, u2, u3;
+    uint8x8_t t0;
 
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    uint16x4_t res5, res6, res7;
+    uint8x8_t t1;
+#endif
     const int16x4_t round_offset64 = vdup_n_s16(round_offset);
     const int16x4_t shift_vec = vdup_n_s16(-shift_value);
     const int16x4_t zero = vdup_n_s16(0);
@@ -1111,6 +1448,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
       s += (7 * src_stride);
       do {
+#if defined(__aarch64__)
         load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
 
         u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
@@ -1154,17 +1492,13 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
                           round_offset64, round_bits, use_jnt_comp_avg, &t0,
                           &t1);
 
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                        0);  // 00 01 02 03
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
           d_u8 += dst8_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
-                        1);  // 10 11 12 13
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
           d_u8 += dst8_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                        0);  // 20 21 22 23
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
           d_u8 += dst8_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
-                        1);  // 30 31 32 33
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
           d_u8 += dst8_stride;
         } else {
           store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
@@ -1183,6 +1517,44 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
 
         s += (src_stride << 2);
         height -= 4;
+#else
+        load_unaligned_u8_4x1(s, src_stride, &tu0);
+        u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        s7 = vget_low_s16(u0);
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                               zero, shift_vec);
+
+        d0 = vadd_s16(d0, round_offset64);
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d);
+
+          res4 = vld1_u16(d);
+          d += (dst_stride);
+
+          compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+                          bck_offset, round_offset64, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+          d_u8 += dst8_stride;
+        } else {
+          vst1_u16(d, vreinterpret_u16_s16(d0));
+          d += (dst_stride);
+        }
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        s += (src_stride);
+        height--;
+#endif
       } while (height > 0);
       src_ptr += 4;
       dst_ptr += 4;
@@ -1191,15 +1563,19 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
     } while (width > 0);
   } else {
     CONV_BUF_TYPE *d_tmp;
-    int16x8_t s11, s12, s13, s14;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t res8, res9, res10, res11;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    int16x8_t res0;
+    uint16x8_t res8;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
     const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
     const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
     const int16x4_t round_offset64 = vdup_n_s16(round_offset);
     const int16x8_t zero = vdupq_n_s16(0);
-
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10, s11, s12, s13, s14;
+    int16x8_t res1, res2, res3, res4, res5, res6, res7;
+    uint16x8_t res10, res11, res9;
+#endif
     dst_ptr = dst;
     dst_u8_ptr = dst8;
     do {
@@ -1227,6 +1603,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
       d_u8 = dst_u8_ptr;
 
       do {
+#if defined(__aarch64__)
         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -1316,6 +1693,43 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
         s6 = s14;
         s += (8 * src_stride);
         height -= 8;
+#else
+        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        __builtin_prefetch(dst_ptr);
+
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                                 zero, shift_vec);
+        res0 = vaddq_s16(res0, round_offset128);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d_tmp);
+
+          res8 = vld1q_u16(d_tmp);
+          d_tmp += (dst_stride);
+
+          compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+                          bck_offset, round_offset64, round_bits,
+                          use_jnt_comp_avg, &t0);
+
+          vst1_u8(d_u8, t0);
+          d_u8 += (dst8_stride);
+        } else {
+          vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+          d_tmp += dst_stride;
+        }
+
+        s += (src_stride);
+        height--;
+#endif
       } while (height > 0);
       src_ptr += 8;
       dst_ptr += 8;
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
index 4bf45a52c..c4ae2e784 100644
--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AV1_COMMON_ARM_MEM_NEON_H_
-#define AV1_COMMON_ARM_MEM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
 
 #include <arm_neon.h>
 #include <string.h>
@@ -362,6 +362,15 @@ static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
   *tu1 = vset_lane_u32(a, *tu1, 1);
 }
 
+static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0) {
+  uint32_t a;
+  // Copy 4 bytes from buf into lane 0 of *tu0; the other lane is kept as-is.
+  memcpy(&a, buf, 4);
+  buf += stride;  // buf is not read again after this update
+  *tu0 = vset_lane_u32(a, *tu0, 0);  // reads *tu0: caller must initialize it
+}
+
 static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                          uint32x2_t *tu0) {
   uint32_t a;
@@ -482,4 +491,4 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
   vst1q_u32(s, s4);
 }
 
-#endif  // AV1_COMMON_ARM_MEM_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
index b4808a972..b3a37c4cb 100644
--- a/third_party/aom/av1/common/arm/selfguided_neon.c
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -1007,10 +1007,11 @@ static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
       vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x))));
 }
 
-void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride,
-                                int16_t *src, const int src_stride,
-                                int32_t *dst, const int dst_stride,
-                                const int width, const int height) {
+static void final_filter_fast_internal(uint16_t *A, int32_t *B,
+                                       const int buf_stride, int16_t *src,
+                                       const int src_stride, int32_t *dst,
+                                       const int dst_stride, const int width,
+                                       const int height) {
   int16x8_t s0;
   int32_t *B_tmp, *dst_ptr;
   uint16_t *A_tmp;
@@ -1340,10 +1341,10 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
-                                     int stride, int32_t *flt0, int32_t *flt1,
-                                     int flt_stride, int sgr_params_idx,
-                                     int bit_depth, int highbd) {
+int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+                                    int stride, int32_t *flt0, int32_t *flt1,
+                                    int flt_stride, int sgr_params_idx,
+                                    int bit_depth, int highbd) {
   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
   assert(!(params->r[0] == 0 && params->r[1] == 0));
 
@@ -1376,6 +1377,7 @@ void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
   if (params->r[1] > 0)
     restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
                          bit_depth, sgr_params_idx, 1);
+  return 0;
 }
 
 void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
index fe134087b..8a3d9f07f 100644
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_
-#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
 
 #include <arm_neon.h>
 
@@ -386,6 +386,83 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                      vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
 }
 
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+  int16x8x2_t b0;  // val[0] = {lo(a0), lo(a1)}, val[1] = {hi(a0), hi(a1)}
+  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+                           vreinterpret_s16_s32(vget_low_s32(a1)));
+  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+                           vreinterpret_s16_s32(vget_high_s32(a1)));
+  return b0;  // lo/hi are 64-bit halves, reinterpreted as four int16 lanes
+}
+
+static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
+  // Transposes the 8x8 int16 matrix whose rows are a0[0..7] into out[0..7].
+  // Swap 16 bit elements (aN below denotes a0[N]). Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
+  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
+  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
+  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
+  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 04 14 24 34 44 54 64 74
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 05 15 25 35 45 55 65 75
+  // d2.val[0]: 02 12 22 32 42 52 62 72
+  // d2.val[1]: 06 16 26 36 46 56 66 76
+  // d3.val[0]: 03 13 23 33 43 53 63 73
+  // d3.val[1]: 07 17 27 37 47 57 67 77
+  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+  // All eight input rows are read before any store, so out may alias a0.
+  *out = d0.val[0];
+  *(out + 1) = d1.val[0];
+  *(out + 2) = d2.val[0];
+  *(out + 3) = d3.val[0];
+  *(out + 4) = d0.val[1];
+  *(out + 5) = d1.val[1];
+  *(out + 6) = d2.val[1];
+  *(out + 7) = d3.val[1];
+}
+
 static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                       int16x4_t *a2, int16x4_t *a3) {
   // Swap 16 bit elements. Goes from:
@@ -457,4 +534,4 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
   *a3 = c1.val[1];
 }
 
-#endif  // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 000000000..7f02d42a7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+   * Each coefficient is stored in 8 bits instead of 16 bits
+   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+     This is done in order to avoid overflow: Since the tap with the largest
+     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+     convolve functions.
+
+     Instead, we use the summation order
+     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+     The rearrangement of coefficients in this table is so that we can get the
+     coefficients into the correct order more quickly.
+*/
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+                filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+  // Offsets in [-1, 0): taps 6 and 7 (columns 3 and 7 here) are all zero.
+  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
+  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
+  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
+  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
+  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
+  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
+  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
+  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
+  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
+  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
+  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
+  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
+  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
+  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
+  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
+  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
+  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
+  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
+  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
+  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
+  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
+  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
+  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
+  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
+  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
+  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
+  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
+  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
+  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
+  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
+  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
+  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
+  // Offsets in [0, 1): all eight taps may be non-zero.
+  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
+  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
+  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
+  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
+  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
+  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
+  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
+  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
+  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
+  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
+  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
+  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
+  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
+  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
+  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
+  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
+  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
+  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
+  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
+  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
+  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
+  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
+  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
+  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
+  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
+  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
+  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
+  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
+  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
+  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
+  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
+  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
+  // Offsets in [1, 2): taps 0 and 1 (columns 0 and 4 here) are all zero.
+  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
+  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
+  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
+  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
+  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
+  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
+  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
+  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
+  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
+  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
+  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
+  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
+  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
+  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
+  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
+  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
+  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
+  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
+  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
+  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
+  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
+  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
+  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
+  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
+  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
+  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
+  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
+  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
+  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
+  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
+  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
+  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
+  // Dummy final row (index 192): a copy of row 191.
+  { 0, 0,   2,  -1, 0,   0, 127, 0},
+
+#else  // Coarser table: 32 rows per unit interval.
+  // Offsets in [-1, 0): taps 6 and 7 (columns 3 and 7 here) are all zero.
+  { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
+  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
+  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
+  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
+  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
+  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
+  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
+  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
+  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
+  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
+  { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
+  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
+  { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
+  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
+  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
+  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
+  // Offsets in [0, 1): all eight taps may be non-zero.
+  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
+  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
+  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
+  {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
+  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
+  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
+  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
+  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
+  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
+  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
+  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
+  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
+  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
+  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
+  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
+  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
+  // Offsets in [1, 2): taps 0 and 1 (columns 0 and 4 here) are all zero.
+  { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
+  { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
+  { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
+  { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
+  { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
+  { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
+  { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
+  { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
+  { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
+  { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
+  { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
+  { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
+  { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
+  { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
+  { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
+  { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
+  // Dummy final row (index 96): a copy of row 95.
+  { 0, 0,   4,  -3, 0,  -1, 127, 1},
+#endif  // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
+// Multiply-accumulate helper for the horizontal pass. x0 and x1 each carry
+// tap pairs of two filters (first pair in .val[0], second pair in .val[1]);
+// src_0 and src_1 hold the matching pixel pairs. Each of the four lanes
+// written to *res is a sum of four tap * pixel products, kept in 16 bits.
+static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
+                            uint8x8_t src_1, int16x4_t *res) {
+  // First tap pair of all four filters, then the second tap pair.
+  const int16x8_t taps_a = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
+                                        vreinterpret_s16_s32(x1.val[0]));
+  const int16x8_t taps_b = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
+                                        vreinterpret_s16_s32(x1.val[1]));
+  // Widen the pixels to 16 bits so they can be multiplied by the taps.
+  const int16x8_t px_a = vreinterpretq_s16_u16(vmovl_u8(src_0));
+  const int16x8_t px_b = vreinterpretq_s16_u16(vmovl_u8(src_1));
+  const int16x8_t acc = vmlaq_s16(vmulq_s16(taps_a, px_a), taps_b, px_b);
+  // The pairwise add folds each filter's two partial sums into one lane.
+  *res = vpadd_s16(vget_low_s16(acc), vget_high_s16(acc));
+}
+
+// Horizontal warp-filter pass for one source row. Output pixel n (0..7) is an
+// 8-tap filter of bytes n..n+7 of src_1, using the taps at
+// filter_8bit_neon[(sx + n * alpha) >> WARPEDDIFF_PREC_BITS]. src_2 must be
+// src_1 rotated by one byte (bytes 1..15, then byte 0), as the caller in this
+// file passes it. Each sum is offset by (1 << offset_bits_horiz), rounded and
+// shifted right by reduce_bits_horiz, then stored to tmp_dst[k + 7] in the
+// pixel order 0, 2, 4, 6, 1, 3, 5, 7. src_3 and src_4 are not used.
+static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
+                                          uint8x16_t src_3, uint8x16_t src_4,
+                                          int16x8_t *tmp_dst, int sx, int alpha,
+                                          int k, const int offset_bits_horiz,
+                                          const int reduce_bits_horiz) {
+  const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0,
+                            255, 0, 255, 0, 255, 0, 255, 0 };
+  const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
+  const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
+  int32x2x2_t b0, b1;
+  int16x4_t even_px_even_taps, odd_px_even_taps;
+  int16x4_t even_px_odd_taps, odd_px_odd_taps;
+
+  (void)src_3;  // Unused: all shifted views are derived from src_1/src_2.
+  (void)src_4;
+
+  // Keep the even-indexed bytes and copy each one into the odd slot before
+  // it: s0 s2 s2 s4 s4 s6 ... in src_1 and s1 s3 s3 s5 ... in src_2, where
+  const uint8x16_t tmp_0 = vandq_u8(src_1, mask);  // sN = byte N of src_1.
+  const uint8x16_t tmp_1 = vandq_u8(src_2, mask);
+  src_1 = vaddq_u8(tmp_0, vextq_u8(tmp_0, tmp_0, 1));
+  src_2 = vaddq_u8(tmp_1, vextq_u8(tmp_1, tmp_1, 1));
+  const uint8x8_t src_1_low = vget_low_u8(src_1);
+  const uint8x8_t src_2_low = vget_low_u8(src_2);
+  const uint8x8_t src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
+  const uint8x8_t src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
+  const uint8x8_t src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
+  const uint8x8_t src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+
+  // Load the eight filters (one per output pixel) and widen them to 16 bits.
+  const int16x8_t f0 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  const int16x8_t f1 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  const int16x8_t f2 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  const int16x8_t f3 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  const int16x8_t f4 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  const int16x8_t f5 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  const int16x8_t f6 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  const int16x8_t f7 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
+
+  // Output pixels 0, 2, 4, 6: taps 0, 2, 4, 6 (low half of each table row).
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
+                vreinterpret_s32_s16(vget_low_s16(f2)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
+                vreinterpret_s32_s16(vget_low_s16(f6)));
+  convolve(b0, b1, src_1_low, src_3_low, &even_px_even_taps);
+  // Output pixels 1, 3, 5, 7: taps 0, 2, 4, 6.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
+                vreinterpret_s32_s16(vget_low_s16(f3)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
+                vreinterpret_s32_s16(vget_low_s16(f7)));
+  convolve(b0, b1, src_2_low, src_4_low, &odd_px_even_taps);
+  // Output pixels 0, 2, 4, 6: taps 1, 3, 5, 7 (high half of each table row).
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
+                vreinterpret_s32_s16(vget_high_s16(f2)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
+                vreinterpret_s32_s16(vget_high_s16(f6)));
+  convolve(b0, b1, src_2_low, src_4_low, &even_px_odd_taps);
+  // Output pixels 1, 3, 5, 7: taps 1, 3, 5, 7.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
+                vreinterpret_s32_s16(vget_high_s16(f3)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
+                vreinterpret_s32_s16(vget_high_s16(f7)));
+  convolve(b0, b1, src_5_low, src_6_low, &odd_px_odd_taps);
+
+  // Join the two halves of every filter sum, widening to 32 bits.
+  int32x4_t tmp_res_low = vaddl_s16(even_px_even_taps, even_px_odd_taps);
+  int32x4_t tmp_res_high = vaddl_s16(odd_px_even_taps, odd_px_odd_taps);
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+  const uint16x8_t res =
+      vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
+  tmp_dst[k + 7] = vreinterpretq_s16_u16(vqrshlq_u16(res, shift));
+}
+
+// Vertical pass of the warp filter for one output row of eight pixels. src
+// points at eight consecutive rows of horizontally filtered values whose
+// lanes are expected in the column order 0, 2, 4, 6, 1, 3, 5, 7 (the order
+// horizontal_filter_neon stores them in). Output column n is the sum over the
+// eight rows of row value * tap, with the taps read from
+// warped_filter + ((sy + n * gamma) >> WARPEDDIFF_PREC_BITS). The 32-bit sums
+// for columns 0-3 go to *res_low and those for columns 4-7 to *res_high; no
+// offset, rounding or shift is applied here.
+static INLINE void vertical_filter_neon(const int16x8_t *src,
+                                        int32x4_t *res_low, int32x4_t *res_high,
+                                        int sy, int gamma) {
+  int16x4_t src_0, src_1, fltr_0, fltr_1;
+  int32x4_t res_0, res_1;
+  int32x2_t res_0_im, res_1_im;
+  int32x4_t res_even, res_odd, im_res_0, im_res_1;
+
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+  int16x8x2_t b0, b1, b2, b3;
+  int32x4x2_t c0, c1, c2, c3;
+  int32x4x2_t d0, d1, d2, d3;
+  // Interleave row pairs so each 32-bit lane holds one column of two rows.
+  b0 = vtrnq_s16(src[0], src[1]);
+  b1 = vtrnq_s16(src[2], src[3]);
+  b2 = vtrnq_s16(src[4], src[5]);
+  b3 = vtrnq_s16(src[6], src[7]);
+  // Regroup the 32-bit lanes: val[0] = cols 0,2 | 1,3; val[1] = cols 4,6 | 5,7.
+  c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                 vreinterpretq_s32_s16(b0.val[1]));
+  c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
+                 vreinterpretq_s32_s16(b1.val[1]));
+  c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                 vreinterpretq_s32_s16(b2.val[1]));
+  c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
+                 vreinterpretq_s32_s16(b3.val[1]));
+  // Load the 8-tap filter for each of the eight output columns.
+  f0 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f1 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f2 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f3 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f4 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f5 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f6 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  f7 = vld1q_s16(
+      (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  // Transpose filter pairs: val[0] = taps 0,1 | 4,5; val[1] = taps 2,3 | 6,7.
+  d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
+  d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
+  d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
+  d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
+
+  // row:0,1 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+  // row:0,1,2,3 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:0,1 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+  // row:0,1,2,3 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:0,1,2,3 even_col:0,2,4,6
+  im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+  // row:4,5 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+  // row:4,5,6,7 even_col:0,2
+  src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:4,5 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+  // row:4,5,6,7 even_col:4,6
+  src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:4,5,6,7 even_col:0,2,4,6
+  im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+  // row:0-7 even_col:0,2,4,6
+  res_even = vaddq_s32(im_res_0, im_res_1);
+
+  // row:0,1 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+  // row:0,1,2,3 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
+  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:0,1 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+  // row:0,1,2,3 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
+  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:0,1,2,3 odd_col:1,3,5,7
+  im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+  // row:4,5 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
+  res_0 = vmull_s16(src_0, fltr_0);
+  // row:4,5,6,7 odd_col:1,3
+  src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
+  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
+  res_0 = vmlal_s16(res_0, src_0, fltr_0);
+  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+  // row:4,5 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
+  res_1 = vmull_s16(src_1, fltr_1);
+  // row:4,5,6,7 odd_col:5,7
+  src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
+  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
+  res_1 = vmlal_s16(res_1, src_1, fltr_1);
+  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+  // row:4,5,6,7 odd_col:1,3,5,7
+  im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+  // row:0-7 odd_col:1,3,5,7
+  res_odd = vaddq_s32(im_res_0, im_res_1);
+
+  // reordering as 0 1 2 3 | 4 5 6 7
+  c0 = vtrnq_s32(res_even, res_odd);
+
+  // Final store
+  *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
+  *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+}
+
+// NEON version of the affine warp for 8-bit pixels. The p_width x p_height
+// prediction block at (p_col, p_row) is handled in tiles of 8x8. For each
+// tile, the source position of tile point (j + 4, i + 4) is computed from
+// mat; a horizontal pass then writes up to 15 filtered rows into tmp and a
+// vertical pass turns them into up to 8 output rows. Source rows are clamped
+// to [0, height - 1] and columns outside [0, width - 1] take the nearest edge
+// pixel of the row. Non-compound results are stored to pred. Compound
+// results are stored to conv_params->dst or, when do_average is set, blended
+// with the values already in conv_params->dst and stored to pred.
+void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
+                          int height, int stride, uint8_t *pred, int p_col,
+                          int p_row, int p_width, int p_height, int p_stride,
+                          int subsampling_x, int subsampling_y,
+                          ConvolveParams *conv_params, int16_t alpha,
+                          int16_t beta, int16_t gamma, int16_t delta) {
+  int16x8_t tmp[15];  // horizontally filtered rows, index k + 7 (k = -7..7)
+  const int bd = 8;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
+  const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
+  const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
+
+  int limit = 0;
+  uint8x16_t vec_dup, mask_val;
+  int32x4_t res_lo, res_hi;
+  int16x8_t result_final;
+  uint8x16_t src_1, src_2, src_3, src_4;
+  uint8x16_t indx_vec = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  uint8x16_t cmp_vec;
+
+  const int reduce_bits_horiz = conv_params->round_0;
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int16x4_t res_sub_const =
+      vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
+                   (1 << (offset_bits - conv_params->round_1 - 1))));
+  int k;
+
+  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+  for (int i = 0; i < p_height; i += 8) {
+    for (int j = 0; j < p_width; j += 8) {
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+      const int32_t x4 = dst_x >> subsampling_x;
+      const int32_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      // Step back four columns and four rows; add rounding and the table bias.
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      // Clear the low WARP_PARAM_REDUCE_BITS bits of both offsets.
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      // Horizontal pass: tmp[k + 7] holds the filtered source row iy4 + k.
+      if (ix4 <= -7) {  // constant row built from column 0
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int16_t dup_val =
+              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+
+          tmp[k + 7] = vdupq_n_s16(dup_val);
+        }
+      } else if (ix4 >= width + 6) {  // constant row from column width - 1
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+                            ref[iy * stride + (width - 1)] *
+                                (1 << (FILTER_BITS - reduce_bits_horiz));
+          tmp[k + 7] = vdupq_n_s16(dup_val);
+        }
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {  // partly outside
+        const int out_of_boundary_left = -(ix4 - 6);
+        const int out_of_boundary_right = (ix4 + 8) - width;
+        // A value >= 0 means lanes on that side fall outside the frame.
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          const uint8_t *src = ref + iy * stride + ix4 - 7;
+          src_1 = vld1q_u8(src);
+          // Overwrite lanes left of column 0 with the row's first pixel.
+          if (out_of_boundary_left >= 0) {
+            limit = out_of_boundary_left + 1;
+            cmp_vec = vdupq_n_u8(out_of_boundary_left);
+            vec_dup = vdupq_n_u8(*(src + limit));
+            mask_val = vcleq_u8(indx_vec, cmp_vec);
+            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+          }
+          if (out_of_boundary_right >= 0) {  // same for the right edge
+            limit = 15 - (out_of_boundary_right + 1);
+            cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+            vec_dup = vdupq_n_u8(*(src + limit));
+            mask_val = vcgeq_u8(indx_vec, cmp_vec);
+            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+          }
+          src_2 = vextq_u8(src_1, src_1, 1);
+          src_3 = vextq_u8(src_2, src_2, 1);
+          src_4 = vextq_u8(src_3, src_3, 1);
+          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+                                 offset_bits_horiz, reduce_bits_horiz);
+        }
+      } else {  // all 16 loaded bytes lie inside the row
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          const uint8_t *src = ref + iy * stride + ix4 - 7;
+          src_1 = vld1q_u8(src);
+          src_2 = vextq_u8(src_1, src_1, 1);
+          src_3 = vextq_u8(src_2, src_2, 1);
+          src_4 = vextq_u8(src_3, src_3, 1);
+          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+                                 offset_bits_horiz, reduce_bits_horiz);
+        }
+      }
+
+      // Vertical pass: output row i + k + 4 is built from tmp[k + 4..k + 11].
+      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+        const int16x8_t *v_src = tmp + (k + 4);
+        vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
+        res_lo = vaddq_s32(res_lo, add_const_vert);
+        res_hi = vaddq_s32(res_hi, add_const_vert);
+        // Compound prediction keeps (or blends with) 16-bit values in dst.
+        if (conv_params->is_compound) {
+          uint16_t *const p =
+              (uint16_t *)&conv_params
+                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+          res_lo = vrshlq_s32(res_lo, shift_vert);
+          if (conv_params->do_average) {
+            uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+            uint16x4_t tmp16_lo = vld1_u16(p);
+            int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
+            int16x4_t tmp16_low;
+            if (conv_params->use_jnt_comp_avg) {
+              res_lo = vmulq_s32(res_lo, bwd);
+              tmp32_lo = vmulq_s32(tmp32_lo, fwd);
+              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+              tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+            } else {
+              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+              tmp16_low = vshrn_n_s32(tmp32_lo, 1);
+            }
+            int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
+            res_low = vqrshl_s16(res_low, round_bits_vec);
+            int16x8_t final_res_low = vcombine_s16(res_low, res_low);
+            uint8x8_t res_8_low = vqmovun_s16(final_res_low);
+            vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
+          } else {
+            uint16x4_t res_u16_low = vqmovun_s32(res_lo);
+            vst1_u16(p, res_u16_low);
+          }
+          if (p_width > 4) {  // columns j + 4..j + 7
+            uint16_t *const p4 =
+                (uint16_t *)&conv_params
+                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+            res_hi = vrshlq_s32(res_hi, shift_vert);
+            if (conv_params->do_average) {
+              uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
+              uint16x4_t tmp16_hi = vld1_u16(p4);
+              int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
+              int16x4_t tmp16_high;
+              if (conv_params->use_jnt_comp_avg) {
+                res_hi = vmulq_s32(res_hi, bwd);
+                tmp32_hi = vmulq_s32(tmp32_hi, fwd);
+                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+                tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
+              } else {
+                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+                tmp16_high = vshrn_n_s32(tmp32_hi, 1);
+              }
+              int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
+              res_high = vqrshl_s16(res_high, round_bits_vec);
+              int16x8_t final_res_high = vcombine_s16(res_high, res_high);
+              uint8x8_t res_8_high = vqmovun_s16(final_res_high);
+              vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+                            0);
+            } else {
+              uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+              vst1_u16(p4, res_u16_high);
+            }
+          }
+        } else {
+          res_lo = vrshlq_s32(res_lo, shift_vert);
+          res_hi = vrshlq_s32(res_hi, shift_vert);
+          // NOTE(review): sub_constant presumably cancels the pass offsets.
+          result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+          result_final = vsubq_s16(result_final, sub_constant);
+          uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+          uint8x8_t val = vqmovun_s16(result_final);
+
+          if (p_width == 4) {  // store only four pixels
+            vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+          } else {
+            vst1_u8(p, val);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
index 72fbed4d4..a9bb5bcf0 100644
--- a/third_party/aom/av1/common/arm/wiener_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -26,7 +26,6 @@
    Apply horizontal filter and store in a temporary buffer. When applying
    vertical filter, overwrite the original pixel values.
  */
-
 void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
@@ -78,8 +77,10 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
   /* if height is a multiple of 8 */
   if (!(h & 7)) {
     int16x8_t res0, res1, res2, res3;
-    uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
+    uint16x8_t res4;
     uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+#if defined(__aarch64__)
+    uint16x8_t res5, res6, res7, res8, res9, res10, res11;
     uint8x8_t t8, t9, t10, t11, t12, t13, t14;
 
     do {
@@ -190,16 +191,64 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
       dst_ptr += 8 * MAX_SB_SIZE;
       height -= 8;
     } while (height > 0);
+#else  // !defined(__aarch64__)
+    uint8x8_t temp_0;  // previous 8 pixels (old t0), base vector for vext
+
+    do {  // one source row per iteration
+      const uint8_t *s;
+
+      __builtin_prefetch(src_ptr);
+
+      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+      width = w;
+
+      __builtin_prefetch(dst_ptr);
+
+      do {  // 8 output values per iteration
+        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        temp_0 = t0;
+        t0 = t7;  // a8..a15 become the next iteration's a0..a7
+
+        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));  // a[i] + a[i+6]
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));  // a[i+1] + a[i+5]
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));  // a[i+2] + a[i+4]
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));  // a[i+3], widened
+        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        vst1q_u16(d_tmp, res4);  // store 8 16-bit intermediate results
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += MAX_SB_SIZE;  // intermediate rows are MAX_SB_SIZE apart
+      height--;
+    } while (height > 0);
+#endif  // defined(__aarch64__)
   } else {
     /*if height is a multiple of 4*/
-    int16x8_t tt0, tt1, tt2, tt3;
     const uint8_t *s;
+    int16x8_t tt0, tt1, tt2, tt3;
+    uint16x8_t d0;
+    uint8x8_t t0, t1, t2, t3;
+
+#if defined(__aarch64__)
     uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t d0, d1, d2, d3;
+    uint16x8_t d1, d2, d3;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
     int16x4_t s11, s12, s13, s14;
-    uint8x8_t t0, t1, t2, t3;
-
     do {
       __builtin_prefetch(src_ptr + 0 * src_stride);
       __builtin_prefetch(src_ptr + 1 * src_stride);
@@ -292,11 +341,61 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
       dst_ptr += 4 * MAX_SB_SIZE;
       height -= 4;
     } while (height > 0);
+#else  // !defined(__aarch64__)
+    uint8x8_t temp_0, t4, t5, t6, t7;  // temp_0: previous 8 pixels (old t0)
+
+    do {  // one source row per iteration
+      __builtin_prefetch(src_ptr);
+
+      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      __builtin_prefetch(dst_ptr);
+
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+      width = w;
+
+      do {  // 8 output values per iteration
+        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        temp_0 = t0;
+        t0 = t7;  // a8..a15 become the next iteration's a0..a7
+
+        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));  // a[i] + a[i+6]
+        tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));  // a[i+1] + a[i+5]
+        tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));  // a[i+2] + a[i+4]
+        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));  // a[i+3], widened
+        d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
+                                        conv_params->round_0);
+
+        vst1q_u16(d_tmp, d0);  // store 8 16-bit intermediate results
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src_ptr += src_stride;
+      dst_ptr += MAX_SB_SIZE;  // intermediate rows are MAX_SB_SIZE apart
+      height -= 1;
+    } while (height > 0);
+#endif  // defined(__aarch64__)
   }
 
   {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    uint8x8_t t0, t1, t2, t3;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    uint8x8_t t0;
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10;
+    uint8x8_t t1, t2, t3;
+#endif
     int16_t *src_tmp_ptr, *s;
     uint8_t *dst_tmp_ptr;
     height = h;
@@ -324,6 +423,7 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
       d = dst_tmp_ptr;
       height = h;
 
+#if defined(__aarch64__)
       do {
         __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
         __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
@@ -397,5 +497,34 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
 
       w -= 8;
     } while (w > 0);
+#else  // !defined(__aarch64__)
+      do {  // one output row per iteration
+        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+
+        s7 = vld1q_s16(s);  // next input row; enters the window as s6 below
+        s += src_stride;
+
+        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+                                       bd, conv_params->round_1);
+
+        vst1_u8(d, t0);  // write 8 output bytes
+        d += dst_stride;
+
+        s0 = s1;  // slide the s0..s6 window down by one row
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        height -= 1;
+      } while (height > 0);
+
+      src_tmp_ptr += 8;  // advance to the next 8-column strip
+      dst_tmp_ptr += 8;
+
+      w -= 8;
+    } while (w > 0);  // closes the outer loop begun above the #if
+#endif  // defined(__aarch64__)
   }
 }
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
index 8514dc64c..7ef2d6d7f 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.c
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -11,56 +11,7 @@
 
 #include <stdlib.h>
 #include "av1/common/av1_inv_txfm1d.h"
-
-static void range_check_buf(int32_t stage, const int32_t *input,
-                            const int32_t *buf, int32_t size, int8_t bit) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  const int64_t max_value = (1LL << (bit - 1)) - 1;
-  const int64_t min_value = -(1LL << (bit - 1));
-
-  int in_range = 1;
-
-  for (int i = 0; i < size; ++i) {
-    if (buf[i] < min_value || buf[i] > max_value) {
-      in_range = 0;
-    }
-  }
-
-  if (!in_range) {
-    fprintf(stderr, "Error: coeffs contain out-of-range values\n");
-    fprintf(stderr, "size: %d\n", size);
-    fprintf(stderr, "stage: %d\n", stage);
-    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
-            max_value);
-
-    fprintf(stderr, "coeffs: ");
-
-    fprintf(stderr, "[");
-    for (int j = 0; j < size; j++) {
-      if (j > 0) fprintf(stderr, ", ");
-      fprintf(stderr, "%d", input[j]);
-    }
-    fprintf(stderr, "]\n");
-
-    fprintf(stderr, "   buf: ");
-
-    fprintf(stderr, "[");
-    for (int j = 0; j < size; j++) {
-      if (j > 0) fprintf(stderr, ", ");
-      fprintf(stderr, "%d", buf[j]);
-    }
-    fprintf(stderr, "]\n\n");
-  }
-
-  assert(in_range);
-#else
-  (void)stage;
-  (void)input;
-  (void)buf;
-  (void)size;
-  (void)bit;
-#endif
-}
+#include "av1/common/av1_txfm.h"
 
 // TODO(angiebird): Make 1-d txfm functions static
 //
@@ -84,7 +35,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[1] = input[2];
   bf1[2] = input[1];
   bf1[3] = input[3];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -94,7 +45,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
   bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
   bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -129,7 +80,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = input[5];
   bf1[6] = input[3];
   bf1[7] = input[7];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -143,7 +94,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
   bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
   bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -157,7 +108,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
   bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
   bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -171,7 +122,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[7] = bf0[7];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -218,7 +169,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = input[11];
   bf1[14] = input[7];
   bf1[15] = input[15];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -240,7 +191,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
   bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
   bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -262,7 +213,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
   bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
   bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -284,7 +235,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
   bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
   bf1[15] = bf0[15];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -306,7 +257,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
   bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -328,7 +279,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -399,7 +350,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = input[23];
   bf1[30] = input[15];
   bf1[31] = input[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -437,7 +388,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
   bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
   bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -475,7 +426,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
   bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
   bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -513,7 +464,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
   bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
   bf1[31] = bf0[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -551,7 +502,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
   bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
   bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -589,7 +540,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -627,7 +578,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
   bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
   bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -665,7 +616,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -760,7 +711,6 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   output[1] = round_shift(x1, bit);
   output[2] = round_shift(x2, bit);
   output[3] = round_shift(x3, bit);
-  range_check_buf(6, input, output, 4, stage_range[6]);
 }
 
 void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -786,7 +736,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = input[4];
   bf1[6] = input[1];
   bf1[7] = input[6];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -800,7 +750,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
   bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -814,7 +764,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
   bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
   bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -828,7 +778,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
   bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -842,7 +792,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
   bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
   bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -856,7 +806,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[5] = bf0[5];
   bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
   bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -903,7 +853,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = input[12];
   bf1[14] = input[1];
   bf1[15] = input[14];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -925,7 +875,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
   bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -947,7 +897,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
   bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -969,7 +919,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
   bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -991,7 +941,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
   bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -1013,7 +963,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
   bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -1035,7 +985,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
   bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
   bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -1057,7 +1007,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[13] = bf0[13];
   bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
   bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -1193,7 +1143,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = input[47];
   bf1[62] = input[31];
   bf1[63] = input[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
@@ -1263,7 +1213,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
   bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
   bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
@@ -1333,7 +1283,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
   bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
   bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
@@ -1403,7 +1353,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
   bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
@@ -1473,7 +1423,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
   bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
   bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
@@ -1543,7 +1493,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
@@ -1613,7 +1563,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
   bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
   bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
@@ -1683,7 +1633,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
@@ -1753,7 +1703,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
   bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
   bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 10
   stage++;
@@ -1823,7 +1773,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check_buf(stage, input, bf1, size, stage_range[stage]);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 11
   stage++;
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
index 64a1a921c..c31c019aa 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_INV_TXFM1D_H_
-#define AV1_INV_TXFM1D_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
 
 #include "av1/common/av1_txfm.h"
 
@@ -58,4 +58,4 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
 }
 #endif
 
-#endif  // AV1_INV_TXFM1D_H_
+#endif  // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
index 4c600f756..7d80a0099 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_INV_TXFM2D_CFG_H_
-#define AV1_INV_TXFM2D_CFG_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
 #include "av1/common/av1_inv_txfm1d.h"
 
 // sum of fwd_shift_##
@@ -44,4 +44,4 @@ extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
 extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
 extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
 
-#endif  // AV1_INV_TXFM2D_CFG_H_
+#endif  // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 9d68b8760..537d8dfe9 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -68,23 +68,6 @@ static const int mode_lf_lut[] = {
 //    10101010|10101010
 //
 // A loopfilter should be applied to every other 4x4 horizontally.
-// TODO(chengchen): make these tables static
-const FilterMask left_txform_mask[TX_SIZES] = {
-  { { 0xffffffffffffffffULL,  // TX_4X4,
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-
-  { { 0x5555555555555555ULL,  // TX_8X8,
-      0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } },
-
-  { { 0x1111111111111111ULL,  // TX_16X16,
-      0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } },
-
-  { { 0x0101010101010101ULL,  // TX_32X32,
-      0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } },
-
-  { { 0x0001000100010001ULL,  // TX_64X64,
-      0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
-};
 
 // 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
 // We use 4 uint64_t to represent the 256 bit.
@@ -113,98 +96,314 @@ const FilterMask left_txform_mask[TX_SIZES] = {
 //    00000000|00000000
 //
 // A loopfilter should be applied to every other 4x4 horizontally.
-const FilterMask above_txform_mask[TX_SIZES] = {
-  { { 0xffffffffffffffffULL,  // TX_4X4
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
 
-  { { 0x0000ffff0000ffffULL,  // TX_8X8
-      0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } },
-
-  { { 0x000000000000ffffULL,  // TX_16X16
-      0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } },
-
-  { { 0x000000000000ffffULL,  // TX_32X32
-      0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } },
-
-  { { 0x000000000000ffffULL,  // TX_64X64
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {  // per block size: id 0..18 in the order of the TX_4X4 rows of the *_mask_univariant_reordered tables (presumed use - confirm at call site); -1 = no id
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
 };
 
-// 64 bit mask to shift and set for each prediction size. A bit is set for
-// each 4x4 block that would be in the top left most block of the given block
-// size in the 64x64 block.
-const FilterMask size_mask_y[BLOCK_SIZES_ALL] = {
-  { { 0x0000000000000001ULL,  // BLOCK_4X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000010001ULL,  // BLOCK_4X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000000003ULL,  // BLOCK_8X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0000000000030003ULL,  // BLOCK_8X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0003000300030003ULL,  // BLOCK_8X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00000000000f000fULL,  // BLOCK_16X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000f000f000f000fULL,  // BLOCK_16X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000f000f000f000fULL,  // BLOCK_16X32
-      0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X32
-      0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X64
-      0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } },
-
-  { { 0xffffffffffffffffULL,  // BLOCK_64X32
-      0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0xffffffffffffffffULL,  // BLOCK_64X64
-      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-  // Y plane max coding block size is 128x128, but the codec divides it
-  // into 4 64x64 blocks.
-  // BLOCK_64X128
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-  // BLOCK_128X64
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-  // BLOCK_128X128
-  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-
-  { { 0x0001000100010001ULL,  // BLOCK_4X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x000000000000000fULL,  // BLOCK_16X4
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
-  { { 0x0003000300030003ULL,  // BLOCK_8X32
-      0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {  // per block size: id 0..13 in the order of the TX_8X8 rows of the *_mask_univariant_reordered tables (presumed use - confirm at call site); -1 = no id
+  -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
+};
 
-  { { 0x0000000000ff00ffULL,  // BLOCK_32X8
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {  // per block size: id 0..8 in the order of the TX_16X16 rows of the *_mask_univariant_reordered tables (presumed use - confirm at call site); -1 = no id
+  -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
+};
 
-  { { 0x000f000f000f000fULL,  // BLOCK_16X64
-      0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } },
+const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,  // per block size: id 0..3 in the order of the TX_32X32 rows of the *_mask_univariant_reordered tables (presumed use - confirm); -1 = no id
+                                                      -1, -1, -1, 0,  1,  2,
+                                                      3,  -1, -1, -1, -1, -1,
+                                                      -1, -1, -1, -1 };
+
+const FilterMask left_mask_univariant_reordered[67] = {  // 4 x uint64_t per entry; each word holds 4 rows of 16 4x4 units, one bit per unit
+  // TX_4X4: entries 0-18
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
+  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
+  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
+  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
+  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
+  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
+  // TX_8X8: entries 19-32
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
+  { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
+  { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
+  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
+  { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
+  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
+  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+      0x0055005500550055ULL } },  // block size 32X64, TX_8X8
+  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
+  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+      0x5555555555555555ULL } },  // block size 64X64, TX_8X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
+  { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
+  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+      0x0005000500050005ULL } },  // block size 16X64, TX_8X8
+  { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
+  // TX_16X16: entries 33-41
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
+  { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
+  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
+  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+      0x0011001100110011ULL } },  // block size 32X64, TX_16X16
+  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
+  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+      0x1111111111111111ULL } },  // block size 64X64, TX_16X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X16
+  { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
+  // TX_32X32: entries 42-45
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 32X64, TX_32X32 (column 0 only: the block is 8 units wide, so no edge at column 8)
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
+  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+      0x0101010101010101ULL } },  // block size 64X64, TX_32X32
+  // TX_64X64: entry 46
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 64X64, TX_64X64
+  // 2:1, 1:2 transform sizes: entries 47-60
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
+  { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
+  { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X32
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
+  { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 32X64, TX_32X64
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
+  // 4:1, 1:4 transform sizes: entries 61-66
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+      0x0001000100010001ULL } },  // block size 16X64, TX_16X64
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
+};
 
-  { { 0xffffffffffffffffULL,  // BLOCK_64X16
-      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }
+const FilterMask above_mask_univariant_reordered[67] = {  // 4 x uint64_t per entry; each word holds 4 rows of 16 4x4 units, one bit per unit
+  // TX_4X4: entries 0-18
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
+  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
+  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
+  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
+  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
+  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
+  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
+  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
+  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
+  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
+  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
+  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
+  // TX_8X8: entries 19-32
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
+  { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
+  { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
+  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
+  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+      0x000000ff000000ffULL } },  // block size 32X64, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+      0x0000ffff0000ffffULL } },  // block size 64X64, TX_8X8
+  { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
+  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+      0x0000000f0000000fULL } },  // block size 16X64, TX_8X8
+  { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
+  // TX_16X16: entries 33-41
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
+  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
+  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
+  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+      0x00000000000000ffULL } },  // block size 32X64, TX_16X16
+  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
+  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+      0x000000000000ffffULL } },  // block size 64X64, TX_16X16
+  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+      0x000000000000000fULL } },  // block size 16X64, TX_16X16
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
+  // TX_32X32: entries 42-45
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+      0x0000000000000000ULL } },  // block size 32X64, TX_32X32
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+      0x0000000000000000ULL } },  // block size 64X64, TX_32X32
+  // TX_64X64: entry 46
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X64, TX_64X64
+  // 2:1, 1:2 transform sizes: entries 47-60
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
+  { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
+  { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X64, TX_32X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
+  // 4:1, 1:4 transform sizes: entries 61-66
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
 };
 
 LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
                                      int mi_col) {
-  if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
-      (mi_col << MI_SIZE_LOG2) >= cm->width)
-    return NULL;
   assert(cm->lf.lfm != NULL);
   const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
   const int col = mi_col >> MIN_MIB_SIZE_LOG2;
@@ -248,10 +447,10 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
            SIMD_WIDTH);
   }
 }
-static uint8_t get_filter_level(const AV1_COMMON *cm,
-                                const loop_filter_info_n *lfi_n,
-                                const int dir_idx, int plane,
-                                const MB_MODE_INFO *mbmi) {
+
+uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
+                         const int dir_idx, int plane,
+                         const MB_MODE_INFO *mbmi) {
   const int segment_id = mbmi->segment_id;
   if (cm->delta_lf_present_flag) {
     int delta_lf;
@@ -374,30 +573,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
       }
     }
   }
-
-#if LOOP_FILTER_BITMASK
-  memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64,
-         sizeof(TX_SIZE) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.y_level_above, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.y_level_left, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.u_level_above, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.u_level_left, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.v_level_above, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.v_level_left, 0,
-         sizeof(uint8_t) * MI_SIZE_64X64);
-  memset(lf->neighbor_sb_lpf_info.skip, 0, sizeof(uint8_t) * MI_SIZE_64X64);
-#endif  // LOOP_FILTER_BITMASK
 }
 
 #if LOOP_FILTER_BITMASK
@@ -413,7 +588,7 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
 // After locating which uint64_t, mi_row % 4 is the
 // row offset, and each row has 16 = 1 << stride_log2 4x4 units.
 // Therefore, shift = (row << stride_log2) + mi_col;
-static int get_index_shift(int mi_col, int mi_row, int *index) {
+int get_index_shift(int mi_col, int mi_row, int *index) {
   // *index = mi_row >> 2;
   // rows = mi_row % 4;
   // stride_log2 = 4;
@@ -588,15 +763,9 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
           else
             lfm->lfl_y_hor[row][col] = level;
         } else if (plane == 1) {
-          if (dir == VERT_EDGE)
-            lfm->lfl_u_ver[row][col] = level;
-          else
-            lfm->lfl_u_hor[row][col] = level;
+          lfm->lfl_u[row][col] = level;
         } else {
-          if (dir == VERT_EDGE)
-            lfm->lfl_v_ver[row][col] = level;
-          else
-            lfm->lfl_v_hor[row][col] = level;
+          lfm->lfl_v[row][col] = level;
         }
       }
     }
@@ -623,11 +792,12 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
           const TX_SIZE prev_tx_size =
               plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
                     : mbmi_prev->tx_size;
-          const TX_SIZE min_tx_size =
-              (dir == VERT_EDGE) ? AOMMIN(txsize_horz_map[tx_size],
-                                          txsize_horz_map[prev_tx_size])
-                                 : AOMMIN(txsize_vert_map[tx_size],
-                                          txsize_vert_map[prev_tx_size]);
+          TX_SIZE min_tx_size = (dir == VERT_EDGE)
+                                    ? AOMMIN(txsize_horz_map[tx_size],
+                                             txsize_horz_map[prev_tx_size])
+                                    : AOMMIN(txsize_vert_map[tx_size],
+                                             txsize_vert_map[prev_tx_size]);
+          min_tx_size = AOMMIN(min_tx_size, TX_16X16);
           assert(min_tx_size < TX_SIZES);
           const int row = r % MI_SIZE_64X64;
           const int col = c % MI_SIZE_64X64;
@@ -883,13 +1053,11 @@ void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
       } else if (plane == 1) {
         av1_zero(lfm->left_u);
         av1_zero(lfm->above_u);
-        av1_zero(lfm->lfl_u_ver);
-        av1_zero(lfm->lfl_u_hor);
+        av1_zero(lfm->lfl_u);
       } else {
         av1_zero(lfm->left_v);
         av1_zero(lfm->above_v);
-        av1_zero(lfm->lfl_v_ver);
-        av1_zero(lfm->lfl_v_hor);
+        av1_zero(lfm->lfl_v);
       }
     }
   }
@@ -979,13 +1147,10 @@ static void filter_selectively_vert_row2(
 
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
           if (plane) {
-            // TODO(any): add aom_lpf_vertical_6_dual for chroma plane.
-            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                               lfi1->hev_thr);
+            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
           } else {
-            // TODO(any): add dual function simd function. Current sse2 code
-            // just called aom_lpf_vertical_14_sse2 twice.
             aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                      lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                      lfi1->hev_thr);
@@ -1005,9 +1170,9 @@ static void filter_selectively_vert_row2(
 
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
           if (plane) {
-            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                               lfi1->hev_thr);
+            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
           } else {
             aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
@@ -1070,10 +1235,9 @@ static void highbd_filter_selectively_vert_row2(
 
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
           if (plane) {
-            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
-                                      lfi0->hev_thr, bd);
-            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
-                                      lfi1->lim, lfi1->hev_thr, bd);
+            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
           } else {
             aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                             lfi0->hev_thr, lfi1->mblim,
@@ -1094,10 +1258,9 @@ static void highbd_filter_selectively_vert_row2(
 
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
           if (plane) {
-            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
-                                      lfi0->hev_thr, bd);
-            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
-                                      lfi1->lim, lfi1->hev_thr, bd);
+            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
           } else {
             aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                            lfi0->hev_thr, lfi1->mblim,
@@ -1163,13 +1326,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
             plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
 
         if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          /*
-          aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                     lfi->hev_thr);
-          */
-
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-          lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+          if (plane) {
+            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          } else {
+            aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, lfin->mblim, lfin->lim,
+                                       lfin->hev_thr);
+          }
           count = 2;
         } else {
           lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1181,28 +1346,24 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
             plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
 
         if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          /*
-          aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, lfin->mblim, lfin->lim,
-                                    lfin->hev_thr);
-          */
-
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-          lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+          if (plane) {
+            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          } else {
+            aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr);
+          }
           count = 2;
         } else {
           lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          /*
           aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
-          */
-          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-          aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
-                               lfin->hev_thr);
           count = 2;
         } else {
           aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1239,15 +1400,15 @@ static void highbd_filter_selectively_horiz(
             plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
 
         if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          /*
-          aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                            lfi->hev_thr, bd);
-          */
-
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-          highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
-                                lfin->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                             lfi->hev_thr, lfin->mblim,
+                                             lfin->lim, lfin->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+                                              lfi->hev_thr, lfin->mblim,
+                                              lfin->lim, lfin->hev_thr, bd);
+          }
           count = 2;
         } else {
           highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1258,15 +1419,15 @@ static void highbd_filter_selectively_horiz(
             plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
 
         if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          /*
-          aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                           lfi->hev_thr, lfin->mblim, lfin->lim,
-                                           lfin->hev_thr, bd);
-          */
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-          highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
-                                lfin->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+                                             lfi->hev_thr, lfin->mblim,
+                                             lfin->lim, lfin->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                             lfi->hev_thr, lfin->mblim,
+                                             lfin->lim, lfin->hev_thr, bd);
+          }
           count = 2;
         } else {
           highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1274,15 +1435,9 @@ static void highbd_filter_selectively_horiz(
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          /*
           aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
-          */
-          aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, bd);
-          aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr, bd);
           count = 2;
         } else {
           aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
@@ -1299,43 +1454,289 @@ static void highbd_filter_selectively_horiz(
   }
 }
 
-static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
-                           uint8_t *dst_buf, int ref_stride, int dst_stride,
-                           int start, int end) {
-  return 0;
-
-  start <<= MI_SIZE_LOG2;
-  end <<= MI_SIZE_LOG2;
-  uint8_t *ref0 = ref_buf;
-  uint8_t *dst0 = dst_buf;
-  if (cm->seq_params.use_highbitdepth) {
-    const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
-    const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
-    for (int j = 0; j < 4; ++j) {
-      for (int i = start; i < end; ++i)
-        if (ref16[i] != dst16[i]) {
-          ref_buf = ref0;
-          dst_buf = dst0;
-          return i + 1;
+void av1_build_bitmask_vert_info(
+    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane) {
+  const int subsampling_x = plane_ptr->subsampling_x;
+  const int subsampling_y = plane_ptr->subsampling_y;
+  const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
+  const int is_uv = plane > 0;
+  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+  uint8_t level, prev_level = 1;
+  int skip, prev_skip = 0;  // 0/1 flags; prev_* hold the left neighbor's info
+  int is_coding_block_border;
+  // Scan each row one tx block at a time and flag left edges needing a filter.
+  for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
+    const int mi_row = r << subsampling_y;
+    const int row = mi_row % MI_SIZE_64X64;
+    int index = 0;
+    const int shift = get_index_shift(0, row, &index);
+
+    for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+         c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+      const int mi_col = c << subsampling_x;
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+      for (int col_in_unit = 0;
+           col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+        const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+        if (x >= plane_ptr->dst.width) break;
+        const int col = col_in_unit << subsampling_x;
+        const uint64_t mask = ((uint64_t)1 << (shift | col));
+        skip = (lfm->skip.bits[index] & mask) != 0;  // avoid int truncation
+        is_coding_block_border = (lfm->is_vert_border.bits[index] & mask) != 0;
+        switch (plane) {
+          case 0: level = lfm->lfl_y_ver[row][col]; break;
+          case 1: level = lfm->lfl_u[row][col]; break;
+          case 2: level = lfm->lfl_v[row][col]; break;
+          default: assert(plane >= 0 && plane <= 2); return;
         }
-      ref16 += ref_stride;
-      dst16 += dst_stride;
+        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+          if (is_uv && ts == TX_64X64) continue;
+          if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+            tx_size = ts;
+            break;
+          }
+        }
+        if ((c + col_in_unit > 0) && (level || prev_level) &&
+            (!prev_skip || !skip || is_coding_block_border)) {
+          const TX_SIZE min_tx_size =
+              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+          const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64;
+          const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64;
+          const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+          switch (plane) {
+            case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+            case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+            case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+            default: assert(plane >= 0 && plane <= 2); return;
+          }
+        }
+
+        // this block becomes the left neighbor of the next one
+        prev_level = level;
+        prev_skip = skip;
+        prev_tx_size = tx_size;
+        // advance by the width of this transform block
+        col_in_unit += tx_size_wide_unit[tx_size];
+      }
     }
-  } else {
-    for (int j = 0; j < 4; ++j) {
-      for (int i = start; i < end; ++i)
-        if (ref_buf[i] != dst_buf[i]) {
-          ref_buf = ref0;
-          dst_buf = dst0;
-          return i + 1;
+  }
+}
+
+void av1_build_bitmask_horz_info(
+    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+    int plane) {
+  const int subsampling_x = plane_ptr->subsampling_x;
+  const int subsampling_y = plane_ptr->subsampling_y;
+  const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
+  const int is_uv = plane > 0;
+  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+  uint8_t level, prev_level = 1;
+  int skip, prev_skip = 0;  // 0/1 flags; prev_* hold the upper neighbor's info
+  int is_coding_block_border;
+  // Scan each column top to bottom by tx block and flag top edges to filter.
+  for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
+    const int mi_col = c << subsampling_x;
+    const int col = mi_col % MI_SIZE_64X64;
+
+    for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+         r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+      const int mi_row = r << subsampling_y;
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+      for (int r_in_unit = 0;
+           r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+        const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+        if (y >= plane_ptr->dst.height) break;
+        const int row = r_in_unit << subsampling_y;
+        int index = 0;
+        const int shift = get_index_shift(col, row, &index);
+        const uint64_t mask = ((uint64_t)1 << shift);
+        skip = (lfm->skip.bits[index] & mask) != 0;  // avoid int truncation
+        is_coding_block_border = (lfm->is_horz_border.bits[index] & mask) != 0;
+        switch (plane) {
+          case 0: level = lfm->lfl_y_hor[row][col]; break;
+          case 1: level = lfm->lfl_u[row][col]; break;
+          case 2: level = lfm->lfl_v[row][col]; break;
+          default: assert(plane >= 0 && plane <= 2); return;
         }
-      ref_buf += ref_stride;
-      dst_buf += dst_stride;
+        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+          if (is_uv && ts == TX_64X64) continue;
+          if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+            tx_size = ts;
+            break;
+          }
+        }
+        if ((r + r_in_unit > 0) && (level || prev_level) &&
+            (!prev_skip || !skip || is_coding_block_border)) {
+          const TX_SIZE min_tx_size =
+              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+          const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64;
+          const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64;
+          const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+          switch (plane) {
+            case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+            case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+            case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+            default: assert(plane >= 0 && plane <= 2); return;
+          }
+        }
+
+        // this block becomes the upper neighbor of the next one
+        prev_level = level;
+        prev_skip = skip;
+        prev_tx_size = tx_size;
+        // advance by the height of this transform block
+        r_in_unit += tx_size_high_unit[tx_size];
+      }
+    }
+  }
+}
+
+void av1_filter_block_plane_bitmask_vert(
+    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+    int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  uint8_t *const buf0 = dst->buf;  // saved so dst->buf can be restored on exit
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;  // keeps the low 16 bits of a shifted mask
+  const int row_step = 1 << ssy;
+  const int two_row_step = 2 << ssy;
+  const int row_stride = dst->stride << MI_SIZE_LOG2;
+  const int two_row_stride = row_stride << 1;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  uint8_t *lfl2;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  assert(lfm);
+  // Filters the vertical edges flagged in lfm->left_{y,u,v} for plane pl.
+  // Rows are handled in pairs (row, row_next) by the *_vert_row2 helpers.
+  for (int r = 0;
+       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+       r += two_row_step) {
+    const int row = r | ssy;
+    const int row_next = row + row_step;
+    const int col = ssx;
+    int index = 0;
+    const int shift = get_index_shift(col, row, &index);
+    int index_next = 0;
+    const int shift_next = get_index_shift(col, row_next, &index_next);
+    switch (pl) {
+      case 0:
+        mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+        lfl = &lfm->lfl_y_ver[row][col];
+        lfl2 = &lfm->lfl_y_ver[row_next][col];
+        break;
+      case 1:
+        mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+        lfl = &lfm->lfl_u[row][col];
+        lfl2 = &lfm->lfl_u[row_next][col];
+        break;
+      case 2:
+        mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+        mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+        mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+        lfl = &lfm->lfl_v[row][col];
+        lfl2 = &lfm->lfl_v[row_next][col];
+        break;
+      default: assert(pl >= 0 && pl <= 2); return;
+    }
+    uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+    uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+    uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+    uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+    uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+    uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+    if (cm->seq_params.use_highbitdepth)
+      highbd_filter_selectively_vert_row2(
+          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+    else
+      filter_selectively_vert_row2(
+          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+    dst->buf += two_row_stride;
+  }
+  // restore the buf pointer that the loop above advanced row pair by row pair
+  dst->buf = buf0;
+}
+
+void av1_filter_block_plane_bitmask_horz(
+    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+    int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
+  uint8_t *const buf0 = dst->buf;  // saved so dst->buf can be restored on exit
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;  // keeps the low 16 bits of a shifted mask
+  const int row_step = 1 << ssy;
+  const int row_stride = dst->stride << MI_SIZE_LOG2;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  assert(lfm);
+  for (int r = 0;  // one row of lfm->above_{y,u,v} edge bits per iteration
+       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+       r += row_step) {
+    if (mi_row + r == 0) {  // the first row of the frame is not filtered
+      dst->buf += row_stride;
+      continue;
     }
+    const int row = r | ssy;
+    const int col = ssx;
+    int index = 0;
+    const int shift = get_index_shift(col, row, &index);
+    switch (pl) {
+      case 0:
+        mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+        lfl = &lfm->lfl_y_hor[row][col];
+        break;
+      case 1:
+        mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+        lfl = &lfm->lfl_u[row][col];
+        break;
+      case 2:
+        mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+        mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+        mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+        lfl = &lfm->lfl_v[row][col];
+        break;
+      default: assert(pl >= 0 && pl <= 2); return;
+    }
+    mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+    mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+    mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+    if (cm->seq_params.use_highbitdepth)
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
+    else
+      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
+    dst->buf += row_stride;
   }
-  ref_buf = ref0;
-  dst_buf = dst0;
-  return 0;
+  // restore the buf pointer that the loop above advanced row by row
+  dst->buf = buf0;
 }
 
 void av1_filter_block_plane_ver(AV1_COMMON *const cm,
@@ -1385,15 +1786,15 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
           mask_16x16 = lfm->left_u[TX_16X16].bits[index];
           mask_8x8 = lfm->left_u[TX_8X8].bits[index];
           mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_ver[row][col];
-          lfl2 = &lfm->lfl_u_ver[row_next][col];
+          lfl = &lfm->lfl_u[row][col];
+          lfl2 = &lfm->lfl_u[row_next][col];
           break;
         case 2:
           mask_16x16 = lfm->left_v[TX_16X16].bits[index];
           mask_8x8 = lfm->left_v[TX_8X8].bits[index];
           mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_ver[row][col];
-          lfl2 = &lfm->lfl_v_ver[row_next][col];
+          lfl = &lfm->lfl_v[row][col];
+          lfl2 = &lfm->lfl_v[row_next][col];
           break;
         default: assert(pl >= 0 && pl <= 2); return;
       }
@@ -1460,13 +1861,13 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
           mask_16x16 = lfm->above_u[TX_16X16].bits[index];
           mask_8x8 = lfm->above_u[TX_8X8].bits[index];
           mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_hor[row][col];
+          lfl = &lfm->lfl_u[row][col];
           break;
         case 2:
           mask_16x16 = lfm->above_v[TX_16X16].bits[index];
           mask_8x8 = lfm->above_v[TX_8X8].bits[index];
           mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_hor[row][col];
+          lfl = &lfm->lfl_v[row][col];
           break;
         default: assert(pl >= 0 && pl <= 2); return;
       }
@@ -1820,6 +2221,9 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
 
 static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                              MACROBLOCKD *xd, int start, int stop,
+#if LOOP_FILTER_BITMASK
+                             int is_decoding,
+#endif
                              int plane_start, int plane_end) {
   struct macroblockd_plane *pd = xd->plane;
   const int col_start = 0;
@@ -1827,6 +2231,45 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
   int mi_row, mi_col;
   int plane;
 
+#if LOOP_FILTER_BITMASK
+  if (is_decoding) {
+    for (plane = plane_start; plane < plane_end; plane++) {
+      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+        break;
+      else if (plane == 1 && !(cm->lf.filter_level_u))
+        continue;
+      else if (plane == 2 && !(cm->lf.filter_level_v))
+        continue;
+
+      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
+                           plane, plane + 1);
+      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
+      av1_build_bitmask_horz_info(cm, &pd[plane], plane);
+
+      // apply loop filtering which only goes through buffer once
+      for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
+        for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
+          av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col,
+                               plane, plane + 1);
+          av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
+                                              mi_col);
+          if (mi_col - MI_SIZE_64X64 >= 0) {
+            av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+                                 mi_col - MI_SIZE_64X64, plane, plane + 1);
+            av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+                                                mi_col - MI_SIZE_64X64);
+          }
+        }
+        av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+                             mi_col - MI_SIZE_64X64, plane, plane + 1);
+        av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+                                            mi_col - MI_SIZE_64X64);
+      }
+    }
+    return;
+  }
+#endif
+
   for (plane = plane_start; plane < plane_end; plane++) {
     if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
       break;
@@ -1910,8 +2353,11 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
 }
 
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                           MACROBLOCKD *xd, int plane_start, int plane_end,
-                           int partial_frame) {
+                           MACROBLOCKD *xd,
+#if LOOP_FILTER_BITMASK
+                           int is_decoding,
+#endif
+                           int plane_start, int plane_end, int partial_frame) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
   start_mi_row = 0;
@@ -1923,6 +2369,9 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
   av1_loop_filter_frame_init(cm, plane_start, plane_end);
-  loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                   plane_end);
+  loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
+#if LOOP_FILTER_BITMASK
+                   is_decoding,
+#endif
+                   plane_start, plane_end);
 }
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
index c35c3b2dc..80ac61178 100644
--- a/third_party/aom/av1/common/av1_loopfilter.h
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_LOOPFILTER_H_
-#define AV1_COMMON_LOOPFILTER_H_
+#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_
 
 #include "config/aom_config.h"
 
@@ -60,51 +60,20 @@ typedef struct {
   uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
   uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
 
-  // U plane vertical edge and horizontal edge filter level
-  uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+  // U plane filter level
+  uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64];
 
-  // V plane vertical edge and horizontal edge filter level
-  uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-} LoopFilterMask;
+  // V plane filter level
+  uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64];
 
-// To determine whether to apply loop filtering at one transform block edge,
-// we need information of the neighboring transform block. Specifically,
-// in determining a vertical edge, we need the information of the tx block
-// to its left. For a horizontal edge, we need info of the tx block above it.
-// Thus, we need to record info of right column and bottom row of tx blocks.
-// We record the information of the neighboring superblock, when bitmask
-// building for a superblock is finished. And it will be used for next
-// superblock bitmask building.
-// Information includes:
-// ------------------------------------------------------------
-//                    MI_SIZE_64X64
-// Y  tx_size above |--------------|
-// Y  tx_size left  |--------------|
-// UV tx_size above |--------------|
-// UV tx_size left  |--------------|
-// Y level above    |--------------|
-// Y level left     |--------------|
-// U level above    |--------------|
-// U level left     |--------------|
-// V level above    |--------------|
-// V level left     |--------------|
-// skip             |--------------|
-// ------------------------------------------------------------
-typedef struct {
-  TX_SIZE tx_size_y_above[MI_SIZE_64X64];
-  TX_SIZE tx_size_y_left[MI_SIZE_64X64];
-  TX_SIZE tx_size_uv_above[MI_SIZE_64X64];
-  TX_SIZE tx_size_uv_left[MI_SIZE_64X64];
-  uint8_t y_level_above[MI_SIZE_64X64];
-  uint8_t y_level_left[MI_SIZE_64X64];
-  uint8_t u_level_above[MI_SIZE_64X64];
-  uint8_t u_level_left[MI_SIZE_64X64];
-  uint8_t v_level_above[MI_SIZE_64X64];
-  uint8_t v_level_left[MI_SIZE_64X64];
-  uint8_t skip[MI_SIZE_64X64];
-} LpfSuperblockInfo;
+  // other info
+  FilterMask skip;
+  FilterMask is_vert_border;
+  FilterMask is_horz_border;
+  // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
+  FilterMask tx_size_ver[2][5];
+  FilterMask tx_size_hor[2][5];
+} LoopFilterMask;
 #endif  // LOOP_FILTER_BITMASK
 
 struct loopfilter {
@@ -130,7 +99,6 @@ struct loopfilter {
   LoopFilterMask *lfm;
   size_t lfm_num;
   int lfm_stride;
-  LpfSuperblockInfo neighbor_sb_lpf_info;
 #endif  // LOOP_FILTER_BITMASK
 };
 
@@ -157,9 +125,15 @@ void av1_loop_filter_init(struct AV1Common *cm);
 void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
                                 int plane_end);
 
+#if LOOP_FILTER_BITMASK
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+                           struct macroblockd *mbd, int is_decoding,
+                           int plane_start, int plane_end, int partial_frame);
+#else
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                            struct macroblockd *mbd, int plane_start,
                            int plane_end, int partial_frame);
+#endif
 
 void av1_filter_block_plane_vert(const struct AV1Common *const cm,
                                  const MACROBLOCKD *const xd, const int plane,
@@ -180,6 +154,9 @@ typedef struct LoopFilterWorkerData {
   MACROBLOCKD *xd;
 } LFWorkerData;
 
+uint8_t get_filter_level(const struct AV1Common *cm,
+                         const loop_filter_info_n *lfi_n, const int dir_idx,
+                         int plane, const MB_MODE_INFO *mbmi);
 #if LOOP_FILTER_BITMASK
 void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
                        int plane, int subsampling_x, int subsampling_y,
@@ -192,10 +169,59 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm,
 void av1_filter_block_plane_hor(struct AV1Common *const cm,
                                 struct macroblockd_plane *const plane, int pl,
                                 int mi_row, int mi_col);
+LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
+                                     int mi_row, int mi_col);
+int get_index_shift(int mi_col, int mi_row, int *index);
+
+static const FilterMask left_txform_mask[TX_SIZES] = {  // every 16th bit
+  { { 0x0000000000000001ULL,  // TX_4X4: bit 0 of word 0
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0000000000010001ULL,  // TX_8X8: bits 0 and 16 of word 0
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_16X16: bits 0, 16, 32, 48 of word 0
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_32X32: same pattern in words 0 and 1
+      0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+  { { 0x0001000100010001ULL,  // TX_64X64: same pattern in all four words
+      0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
+
+static const uint64_t above_txform_mask[2][TX_SIZES] = {
+  {  // runs of 1, 2, 4, 8 and 16 consecutive low bits
+      0x0000000000000001ULL,  // TX_4X4
+      0x0000000000000003ULL,  // TX_8X8
+      0x000000000000000fULL,  // TX_16X16
+      0x00000000000000ffULL,  // TX_32X32
+      0x000000000000ffffULL,  // TX_64X64
+  },
+  {  // same bit counts, but only even bit positions are set
+      0x0000000000000001ULL,  // TX_4X4
+      0x0000000000000005ULL,  // TX_8X8
+      0x0000000000000055ULL,  // TX_16X16
+      0x0000000000005555ULL,  // TX_32X32
+      0x0000000055555555ULL,  // TX_64X64
+  },
+};
+
+extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
+
+extern const FilterMask left_mask_univariant_reordered[67];
+
+extern const FilterMask above_mask_univariant_reordered[67];
 #endif
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_LOOPFILTER_H_
+#endif  // AOM_AV1_COMMON_AV1_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index fa8b34981..dee1f1c79 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -76,12 +76,12 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
 specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
 specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
 
+
 # directional intra predictor functions
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
 add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
 add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
 
-
 # FILTER_INTRA predictor functions
 add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
 specialize qw/av1_filter_intra_predictor sse4_1/;
@@ -108,6 +108,22 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
 add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
 
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x16/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/,  "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
+
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 
@@ -122,9 +138,7 @@ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
 
 add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -132,8 +146,6 @@ add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *ou
 add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 
-specialize qw/av1_inv_txfm2d_add_64x64 sse4_1/;
-
 add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -146,13 +158,13 @@ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride
 
 # build compound seg mask functions
 add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
-specialize qw/av1_build_compound_diffwtd_mask sse4_1/;
+specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
 
 add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
 specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
 
 add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
-specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/;
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
 
 #
 # Encoder functions below this point.
@@ -186,7 +198,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -203,6 +217,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
 
   add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -218,7 +233,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
   specialize qw/av1_temporal_filter_apply sse2 msa/;
 
-  add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+  add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
   # ENCODEMB INVOKE
 
@@ -238,7 +253,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
   specialize qw/av1_get_nz_map_contexts sse2/;
   add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
-  specialize qw/av1_txb_init_levels sse4_1/;
+  specialize qw/av1_txb_init_levels sse4_1 avx2/;
 
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
@@ -251,6 +266,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
   specialize qw/av1_get_crc32c_value sse4_2/;
 
+  add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride,  double *M, double *H";
+  specialize qw/av1_compute_stats sse4_1 avx2/;
+
+  add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+  specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
 }
 # end encoder functions
 
@@ -275,7 +295,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
 # WARPED_MOTION / GLOBAL_MOTION functions
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1/;
+specialize qw/av1_warp_affine sse4_1 neon/;
 
 add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_highbd_warp_affine sse4_1/;
@@ -290,9 +310,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
 specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
 
-add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
-                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
-                                  int sgr_params_idx, int bit_depth, int highbd";
+add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                 int sgr_params_idx, int bit_depth, int highbd";
 specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
 
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
index 1e6654121..bb70eab70 100644
--- a/third_party/aom/av1/common/av1_txfm.c
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -108,3 +108,59 @@ const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
   1,   // TXFM_TYPE_IDENTITY16
   1,   // TXFM_TYPE_IDENTITY32
 };
+
+// Debug aid: checks that every element of buf[0..size) fits in a signed
+// 'bit'-bit integer. When CONFIG_COEFFICIENT_RANGE_CHECKING is enabled and an
+// element is out of range, 'size', 'stage', the allowed range and the contents
+// of both 'input' and 'buf' are written to stderr before the assertion fires.
+// Without CONFIG_COEFFICIENT_RANGE_CHECKING the function does nothing.
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+                         const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t max_value = (1LL << (bit - 1)) - 1;
+  const int64_t min_value = -(1LL << (bit - 1));
+
+  int in_range = 1;
+
+  for (int i = 0; i < size; ++i) {
+    if (buf[i] < min_value || buf[i] > max_value) {
+      in_range = 0;
+    }
+  }
+
+  if (!in_range) {
+    // int32_t need not be 'int', so print it with the <inttypes.h> macros.
+    fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+    fprintf(stderr, "size: %" PRId32 "\n", size);
+    fprintf(stderr, "stage: %" PRId32 "\n", stage);
+    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+            max_value);
+
+    fprintf(stderr, "coeffs: ");
+
+    fprintf(stderr, "[");
+    for (int j = 0; j < size; j++) {
+      if (j > 0) fprintf(stderr, ", ");
+      fprintf(stderr, "%" PRId32, input[j]);
+    }
+    fprintf(stderr, "]\n");
+
+    fprintf(stderr, "   buf: ");
+
+    fprintf(stderr, "[");
+    for (int j = 0; j < size; j++) {
+      if (j > 0) fprintf(stderr, ", ");
+      fprintf(stderr, "%" PRId32, buf[j]);
+    }
+    fprintf(stderr, "]\n\n");
+  }
+
+  assert(in_range);
+#else
+  (void)stage;
+  (void)input;
+  (void)buf;
+  (void)size;
+  (void)bit;
+#endif
+}
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index c9cc79852..59d64ca4a 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_TXFM_H_
-#define AV1_TXFM_H_
+#ifndef AOM_AV1_COMMON_AV1_TXFM_H_
+#define AOM_AV1_COMMON_AV1_TXFM_H_
 
 #include <assert.h>
 #include <math.h>
@@ -39,7 +39,7 @@ extern const int32_t av1_sinpi_arr_data[7][5];
 static const int cos_bit_min = 10;
 static const int cos_bit_max = 16;
 
-static const int NewSqrt2Bits = 12;
+#define NewSqrt2Bits ((int32_t)12)
 // 2^12 * sqrt(2)
 static const int32_t NewSqrt2 = 5793;
 // 2^12 / sqrt(2)
@@ -64,7 +64,7 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
 #if DO_RANGE_CHECK_CLAMP
   bit = AOMMIN(bit, 31);
-  return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1)));
+  return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1);
 #endif  // DO_RANGE_CHECK_CLAMP
   (void)bit;
   return value;
@@ -78,10 +78,25 @@ static INLINE int32_t round_shift(int64_t value, int bit) {
 static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
                                int bit) {
   int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
+  int64_t intermediate = result_64 + (1LL << (bit - 1));
+  // NOTE(david.barker): The value 'result_64' may not necessarily fit
+  // into 32 bits. However, the result of this function is nominally
+  // ROUND_POWER_OF_TWO_64(result_64, bit)
+  // and that is required to fit into stage_range[stage] many bits
+  // (checked by range_check_buf()).
+  //
+  // Here we've unpacked that rounding operation, and it can be shown
+  // that the value of 'intermediate' here *does* fit into 32 bits
+  // for any conformant bitstream.
+  // The upshot is that, if you do all this calculation using
+  // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic,
+  // then you'll still get the correct result.
+  // To provide a check on this logic, we assert that 'intermediate'
+  // would fit into an int32 if range checking is enabled.
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
-  assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX);
+  assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX);
 #endif
-  return round_shift(result_64, bit);
+  return (int32_t)(intermediate >> bit);
 }
 
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
@@ -206,9 +221,12 @@ static INLINE int get_txw_idx(TX_SIZE tx_size) {
 static INLINE int get_txh_idx(TX_SIZE tx_size) {
   return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
 }
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+                         const int32_t *buf, int32_t size, int8_t bit);
 #define MAX_TXWH_IDX 5
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
 
-#endif  // AV1_TXFM_H_
+#endif  // AOM_AV1_COMMON_AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
index 86b4b5d6c..2e796b656 100644
--- a/third_party/aom/av1/common/blockd.c
+++ b/third_party/aom/av1/common/blockd.c
@@ -28,66 +28,6 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
   return above_mi->mode;
 }
 
-void av1_foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
-    foreach_transformed_block_visitor visit, void *arg) {
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
-  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-  // transform size varies per plane, look it up in a common way.
-  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
-  const uint8_t txw_unit = tx_size_wide_unit[tx_size];
-  const uint8_t txh_unit = tx_size_high_unit[tx_size];
-  const int step = txw_unit * txh_unit;
-  int i = 0, r, c;
-
-  // If mb_to_right_edge is < 0 we are in a situation in which
-  // the current block size extends into the UMV and we won't
-  // visit the sub blocks that are wholly within the UMV.
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-
-  int blk_row, blk_col;
-
-  const BLOCK_SIZE max_unit_bsize =
-      get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
-  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-  mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
-  mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
-
-  // Keep track of the row and column of the blocks we use so that we know
-  // if we are in the unrestricted motion border.
-  for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
-    const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
-    // Skip visiting the sub blocks that are wholly within the UMV.
-    for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
-      const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
-      for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
-        for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
-          visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
-          i += step;
-        }
-      }
-    }
-  }
-}
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
-                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   foreach_transformed_block_visitor visit,
-                                   void *arg, const int num_planes) {
-  for (int plane = 0; plane < num_planes; ++plane) {
-    if (!is_chroma_reference(mi_row, mi_col, bsize,
-                             xd->plane[plane].subsampling_x,
-                             xd->plane[plane].subsampling_y))
-      continue;
-    av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
-  }
-}
-
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                       int has_eob, int aoff, int loff) {
@@ -159,6 +99,10 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
     xd->plane[i].subsampling_x = i ? ss_x : 0;
     xd->plane[i].subsampling_y = i ? ss_y : 0;
   }
+  for (i = num_planes; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].subsampling_x = 1;
+    xd->plane[i].subsampling_y = 1;
+  }
 }
 
 const int16_t dr_intra_derivative[90] = {
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 979f13bd9..a2311c1b0 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_BLOCKD_H_
-#define AV1_COMMON_BLOCKD_H_
+#ifndef AOM_AV1_COMMON_BLOCKD_H_
+#define AOM_AV1_COMMON_BLOCKD_H_
 
 #include "config/aom_config.h"
 
@@ -38,13 +38,13 @@ extern "C" {
 #define MAX_DIFFWTD_MASK_BITS 1
 
 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   DIFFWTD_38 = 0,
   DIFFWTD_38_INV,
   DIFFWTD_MASK_TYPES,
 } DIFFWTD_MASK_TYPE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   KEY_FRAME = 0,
   INTER_FRAME = 1,
   INTRA_ONLY_FRAME = 2,  // replaces intra-only
@@ -57,7 +57,7 @@ static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
 }
 
 static INLINE int is_inter_mode(PREDICTION_MODE mode) {
-  return mode >= NEARESTMV && mode <= NEW_NEWMV;
+  return mode >= INTER_MODE_START && mode < INTER_MODE_END;
 }
 
 typedef struct {
@@ -66,10 +66,10 @@ typedef struct {
 } BUFFER_SET;
 
 static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
-  return mode >= NEARESTMV && mode <= NEWMV;
+  return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END;
 }
 static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
-  return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
+  return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END;
 }
 
 static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
@@ -148,10 +148,6 @@ static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
           mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
 }
 
-static INLINE int use_masked_motion_search(COMPOUND_TYPE type) {
-  return (type == COMPOUND_WEDGE);
-}
-
 static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
   return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
 }
@@ -267,8 +263,8 @@ typedef struct MB_MODE_INFO {
   int mi_row;
   int mi_col;
 #endif
-  int num_proj_ref[2];
-  WarpedMotionParams wm_params[2];
+  int num_proj_ref;
+  WarpedMotionParams wm_params;
 
   // Index of the alpha Cb and alpha Cr combination
   int cfl_alpha_idx;
@@ -376,7 +372,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
 }
 #endif
 
-enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
 
 struct buf_2d {
   uint8_t *buf;
@@ -500,6 +496,8 @@ typedef struct jnt_comp_params {
   int bck_offset;
 } JNT_COMP_PARAMS;
 
+// Most/all of the pointers are mere pointers to actual arrays that are
+// allocated elsewhere. This is mostly for coding convenience.
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
 
@@ -544,7 +542,7 @@ typedef struct macroblockd {
   SgrprojInfo sgrproj_info[MAX_MB_PLANE];
 
   // block dimension in the unit of mode_info.
-  uint8_t n8_w, n8_h;
+  uint8_t n4_w, n4_h;
 
   uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
@@ -599,6 +597,9 @@ typedef struct macroblockd {
   uint16_t cb_offset[MAX_MB_PLANE];
   uint16_t txb_offset[MAX_MB_PLANE];
   uint16_t color_index_map_offset[2];
+
+  CONV_BUF_TYPE *tmp_conv_dst;
+  uint8_t *tmp_obmc_bufs[2];
 } MACROBLOCKD;
 
 static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
@@ -623,6 +624,11 @@ static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
   }
 }
 
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
 // Note: the input block size should be square.
 // Otherwise it's considered invalid.
 static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
@@ -781,6 +787,8 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
   return intra_mode_to_tx_type(mbmi, plane_type);
 }
 
+// Implements the get_plane_residual_size() function in the spec (Section
+// 5.11.38. Get plane residual size function).
 static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
                                               int subsampling_x,
                                               int subsampling_y) {
@@ -952,15 +960,6 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size, void *arg);
 
-void av1_foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
-    foreach_transformed_block_visitor visit, void *arg);
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
-                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                   foreach_transformed_block_visitor visit,
-                                   void *arg, const int num_planes);
-
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                       int has_eob, int aoff, int loff);
@@ -976,7 +975,7 @@ static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
 }
 
 static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
-  return (mode >= NEARESTMV) && (mode <= NEWMV);
+  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
 }
 
 static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
@@ -1045,7 +1044,7 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
       is_motion_variation_allowed_compound(mbmi)) {
     if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
     assert(!has_second_ref(mbmi));
-    if (mbmi->num_proj_ref[0] >= 1 &&
+    if (mbmi->num_proj_ref >= 1 &&
         (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
       if (xd->cur_frame_force_integer_mv) {
         return OBMC_CAUSAL;
@@ -1174,4 +1173,4 @@ static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_BLOCKD_H_
+#endif  // AOM_AV1_COMMON_BLOCKD_H_
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
index 092230de9..3b2eac8a5 100644
--- a/third_party/aom/av1/common/cdef.h
+++ b/third_party/aom/av1/common/cdef.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_CDEF_H_
-#define AV1_COMMON_CDEF_H_
+#ifndef AOM_AV1_COMMON_CDEF_H_
+#define AOM_AV1_COMMON_CDEF_H_
 
 #define CDEF_STRENGTH_BITS 6
 
@@ -48,4 +48,4 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // AV1_COMMON_CDEF_H_
+#endif  // AOM_AV1_COMMON_CDEF_H_
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
index 81c6da077..6b4452cd6 100644
--- a/third_party/aom/av1/common/cdef_block.h
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#if !defined(_CDEF_BLOCK_H)
-#define _CDEF_BLOCK_H (1)
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_H_
 
 #include "av1/common/odintrin.h"
 
@@ -56,4 +56,4 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
                     cdef_list *dlist, int cdef_count, int level,
                     int sec_strength, int pri_damping, int sec_damping,
                     int coeff_shift);
-#endif
+#endif  // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
index d24a7c0fa..14587a023 100644
--- a/third_party/aom/av1/common/cdef_block_simd.h
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+
 #include "config/av1_rtcd.h"
 
 #include "av1/common/cdef_block.h"
@@ -913,3 +916,5 @@ void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
     }
   }
 }
+
+#endif  // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
index bc9fbce1b..d627891bf 100644
--- a/third_party/aom/av1/common/cfl.h
+++ b/third_party/aom/av1/common/cfl.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_CFL_H_
-#define AV1_COMMON_CFL_H_
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
 
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
@@ -299,4 +299,4 @@ void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
     return pred[tx_size % TX_SIZES_ALL];                                  \
   }
 
-#endif  // AV1_COMMON_CFL_H_
+#endif  // AOM_AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
index 72c6d3a1e..bed6083db 100644
--- a/third_party/aom/av1/common/common.h
+++ b/third_party/aom/av1/common/common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_COMMON_H_
-#define AV1_COMMON_COMMON_H_
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
 
 /* Interface header for common constant data structures and lookup tables */
 
@@ -60,4 +60,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_COMMON_H_
+#endif  // AOM_AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
index f521f10bf..46e455fdb 100644
--- a/third_party/aom/av1/common/common_data.h
+++ b/third_party/aom/av1/common/common_data.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_COMMON_DATA_H_
-#define AV1_COMMON_COMMON_DATA_H_
+#ifndef AOM_AV1_COMMON_COMMON_DATA_H_
+#define AOM_AV1_COMMON_COMMON_DATA_H_
 
 #include "av1/common/enums.h"
 #include "aom/aom_integer.h"
@@ -20,34 +20,43 @@
 extern "C" {
 #endif
 
-// Log 2 conversion lookup tables in units of mode info(4x4).
+// Log 2 conversion lookup tables in units of mode info (4x4).
+// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
   0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
 };
+// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
   0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
 };
 
+// Width/height lookup tables in units of mode info (4x4).
+// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
   1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
 };
 
+// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables).
 static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
   1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
 };
 
-// Width/height lookup tables in units of various block sizes
+// Width/height lookup tables in units of samples.
+// The Block_Width table in the spec (Section 9.3. Conversion tables).
 static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
   4,  4,  8,  8,   8,   16, 16, 16, 32, 32, 32,
   64, 64, 64, 128, 128, 4,  16, 8,  32, 16, 64
 };
 
+// The Block_Height table in the spec (Section 9.3. Conversion tables).
 static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
   4,  8,  4,   8,  16,  8,  16, 32, 16, 32, 64,
   32, 64, 128, 64, 128, 16, 4,  32, 8,  64, 16
 };
 
-// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
+// Maps a block size to a context.
+// The Size_Group table in the spec (Section 9.3. Conversion tables).
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize)))
 static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
   0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
 };
@@ -56,6 +65,8 @@ static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {
   4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
 };
 
+// A compressed version of the Partition_Subsize table in the spec (Section
+// 9.3. Conversion tables), for square block sizes only.
 /* clang-format off */
 static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
   {     // PARTITION_NONE
@@ -350,34 +361,36 @@ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
   TX_64X64,  // TX_MODE_LARGEST
   TX_64X64,  // TX_MODE_SELECT
 };
-/* clang-format on */
 
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
 static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
-  //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
-  //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
-  { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
-  { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
-  { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
-  { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
-  { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } },
-  { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } },
-  { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
-  { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } },
-  { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } },
-  { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
-  { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } },
-  { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } },
-  { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
-  { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
-  { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
-  { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
-  { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } },
-  { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } },
-  { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
-  { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
-  { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
-  { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+  //  ss_x == 0      ss_x == 0          ss_x == 1      ss_x == 1
+  //  ss_y == 0      ss_y == 1          ss_y == 0      ss_y == 1
+  { { BLOCK_4X4,     BLOCK_4X4 },     { BLOCK_4X4,     BLOCK_4X4 } },
+  { { BLOCK_4X8,     BLOCK_4X4 },     { BLOCK_INVALID, BLOCK_4X4 } },
+  { { BLOCK_8X4,     BLOCK_INVALID }, { BLOCK_4X4,     BLOCK_4X4 } },
+  { { BLOCK_8X8,     BLOCK_8X4 },     { BLOCK_4X8,     BLOCK_4X4 } },
+  { { BLOCK_8X16,    BLOCK_8X8 },     { BLOCK_INVALID, BLOCK_4X8 } },
+  { { BLOCK_16X8,    BLOCK_INVALID }, { BLOCK_8X8,     BLOCK_8X4 } },
+  { { BLOCK_16X16,   BLOCK_16X8 },    { BLOCK_8X16,    BLOCK_8X8 } },
+  { { BLOCK_16X32,   BLOCK_16X16 },   { BLOCK_INVALID, BLOCK_8X16 } },
+  { { BLOCK_32X16,   BLOCK_INVALID }, { BLOCK_16X16,   BLOCK_16X8 } },
+  { { BLOCK_32X32,   BLOCK_32X16 },   { BLOCK_16X32,   BLOCK_16X16 } },
+  { { BLOCK_32X64,   BLOCK_32X32 },   { BLOCK_INVALID, BLOCK_16X32 } },
+  { { BLOCK_64X32,   BLOCK_INVALID }, { BLOCK_32X32,   BLOCK_32X16 } },
+  { { BLOCK_64X64,   BLOCK_64X32 },   { BLOCK_32X64,   BLOCK_32X32 } },
+  { { BLOCK_64X128,  BLOCK_64X64 },   { BLOCK_INVALID, BLOCK_32X64 } },
+  { { BLOCK_128X64,  BLOCK_INVALID }, { BLOCK_64X64,   BLOCK_64X32 } },
+  { { BLOCK_128X128, BLOCK_128X64 },  { BLOCK_64X128,  BLOCK_64X64 } },
+  { { BLOCK_4X16,    BLOCK_4X8 },     { BLOCK_INVALID, BLOCK_4X8 } },
+  { { BLOCK_16X4,    BLOCK_INVALID }, { BLOCK_8X4,     BLOCK_8X4 } },
+  { { BLOCK_8X32,    BLOCK_8X16 },    { BLOCK_INVALID, BLOCK_4X16 } },
+  { { BLOCK_32X8,    BLOCK_INVALID }, { BLOCK_16X8,    BLOCK_16X4 } },
+  { { BLOCK_16X64,   BLOCK_16X32 },   { BLOCK_INVALID, BLOCK_8X32 } },
+  { { BLOCK_64X16,   BLOCK_INVALID }, { BLOCK_32X16,   BLOCK_32X8 } }
 };
+/* clang-format on */
 
 // Generates 5 bit field in which each bit set to 1 represents
 // a blocksize partition  11111 means we split 128x128, 64x64, 32x32, 16x16
@@ -430,4 +443,4 @@ static const int quant_dist_lookup_table[2][4][2] = {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_COMMON_DATA_H_
+#endif  // AOM_AV1_COMMON_COMMON_DATA_H_
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index ed962c722..1f11126fc 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -173,6 +173,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
       int32_t res = 0;
@@ -510,31 +511,73 @@ static void convolve_2d_scale_wrapper(
                         y_step_qn, conv_params);
 }
 
+// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
+// we may create optimized code to do 2-tap filtering for all bilinear filtering
+// usages, not just IntraBC.
+static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    int subpel_x_q4, int subpel_y_q4,
+                                    ConvolveParams *conv_params) {
+  const InterpFilterParams *filter_params_x =
+      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+  const InterpFilterParams *filter_params_y =
+      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, 0, 0, conv_params);
+  } else if (subpel_x_q4 != 0) {
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, 0, 0, conv_params);
+  } else {
+    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, 0, 0, conv_params);
+  }
+}
+
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             InterpFilters interp_filters, const int subpel_x_q4,
                             int x_step_q4, const int subpel_y_q4, int y_step_q4,
                             int scaled, ConvolveParams *conv_params,
-                            const struct scale_factors *sf) {
+                            const struct scale_factors *sf, int is_intrabc) {
+  assert(IMPLIES(is_intrabc, !scaled));
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst;
   (void)dst_stride;
-  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
-  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+    convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
+                            subpel_y_q4, conv_params);
+    return;
+  }
+
+  InterpFilter filter_x = 0;
+  InterpFilter filter_y = 0;
+  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+  if (need_filter_params_x)
+    filter_x = av1_extract_interp_filter(interp_filters, 1);
+  if (need_filter_params_y)
+    filter_y = av1_extract_interp_filter(interp_filters, 0);
   const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size(filter_x, w);
+      need_filter_params_x
+          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+          : NULL;
   const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size(filter_y, h);
+      need_filter_params_y
+          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+          : NULL;
 
-  if (scaled)
+  if (scaled) {
     convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_q4,
                               x_step_q4, subpel_y_q4, y_step_q4, conv_params);
-  else
+  } else {
     sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
         src, src_stride, dst, dst_stride, w, h, filter_params_x,
         filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+  }
 }
 
 void av1_highbd_convolve_2d_copy_sr_c(
@@ -964,24 +1007,68 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
   }
 }
 
+static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
+                                           uint16_t *dst, int dst_stride, int w,
+                                           int h, int subpel_x_q4,
+                                           int subpel_y_q4,
+                                           ConvolveParams *conv_params,
+                                           int bd) {
+  const InterpFilterParams *filter_params_x =
+      subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+  const InterpFilterParams *filter_params_y =
+      subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, 0, 0,
+                                conv_params, bd);
+  } else if (subpel_x_q4 != 0) {
+    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, filter_params_y, 0, 0,
+                               conv_params, bd);
+  } else {
+    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, filter_params_y, 0, 0,
+                               conv_params, bd);
+  }
+}
+
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst8, int dst_stride, int w, int h,
                                    InterpFilters interp_filters,
                                    const int subpel_x_q4, int x_step_q4,
                                    const int subpel_y_q4, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd) {
+                                   const struct scale_factors *sf,
+                                   int is_intrabc, int bd) {
+  assert(IMPLIES(is_intrabc, !scaled));
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst_stride;
-
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
-  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+  if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+    highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
+                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
+    return;
+  }
+
+  InterpFilter filter_x = 0;
+  InterpFilter filter_y = 0;
+  const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+  const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+  if (need_filter_params_x)
+    filter_x = av1_extract_interp_filter(interp_filters, 1);
+  if (need_filter_params_y)
+    filter_y = av1_extract_interp_filter(interp_filters, 0);
   const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size(filter_x, w);
+      need_filter_params_x
+          ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+          : NULL;
   const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size(filter_y, h);
+      need_filter_params_y
+          ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+          : NULL;
 
   if (scaled) {
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -1111,7 +1198,8 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
 
   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
   const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
 
   assert(w <= MAX_SB_SIZE);
   assert(h <= MAX_SB_SIZE);
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index bc2d4bccf..4109dd843 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_AV1_CONVOLVE_H_
-#define AV1_COMMON_AV1_CONVOLVE_H_
+#ifndef AOM_AV1_COMMON_CONVOLVE_H_
+#define AOM_AV1_COMMON_CONVOLVE_H_
 #include "av1/common/filter.h"
 
 #ifdef __cplusplus
@@ -19,7 +19,6 @@ extern "C" {
 
 typedef uint16_t CONV_BUF_TYPE;
 typedef struct ConvolveParams {
-  int ref;
   int do_average;
   CONV_BUF_TYPE *dst;
   int dst_stride;
@@ -59,15 +58,13 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             InterpFilters interp_filters, const int subpel_x_q4,
                             int x_step_q4, const int subpel_y_q4, int y_step_q4,
                             int scaled, ConvolveParams *conv_params,
-                            const struct scale_factors *sf);
+                            const struct scale_factors *sf, int is_intrabc);
 
-static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
-                                                      int plane,
+static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
                                                       CONV_BUF_TYPE *dst,
                                                       int dst_stride,
                                                       int is_compound, int bd) {
   ConvolveParams conv_params;
-  conv_params.ref = ref;
   conv_params.do_average = do_average;
   assert(IMPLIES(do_average, is_compound));
   conv_params.is_compound = is_compound;
@@ -88,15 +85,14 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
   return conv_params;
 }
 
-static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
                                              int bd) {
-  return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
+  return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
 }
 
 static INLINE ConvolveParams get_conv_params_wiener(int bd) {
   ConvolveParams conv_params;
   (void)bd;
-  conv_params.ref = 0;
   conv_params.do_average = 0;
   conv_params.is_compound = 0;
   conv_params.round_0 = WIENER_ROUND0_BITS;
@@ -119,10 +115,11 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    const int subpel_x_q4, int x_step_q4,
                                    const int subpel_y_q4, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd);
+                                   const struct scale_factors *sf,
+                                   int is_intrabc, int bd);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_AV1_CONVOLVE_H_
+#endif  // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
index ef944c5a0..991692c2f 100644
--- a/third_party/aom/av1/common/entropy.h
+++ b/third_party/aom/av1/common/entropy.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENTROPY_H_
-#define AV1_COMMON_ENTROPY_H_
+#ifndef AOM_AV1_COMMON_ENTROPY_H_
+#define AOM_AV1_COMMON_ENTROPY_H_
 
 #include "config/aom_config.h"
 
@@ -178,4 +178,4 @@ static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENTROPY_H_
+#endif  // AOM_AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
index 0bd2e20a1..7047f34d2 100644
--- a/third_party/aom/av1/common/entropymode.h
+++ b/third_party/aom/av1/common/entropymode.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENTROPYMODE_H_
-#define AV1_COMMON_ENTROPYMODE_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_
+#define AOM_AV1_COMMON_ENTROPYMODE_H_
 
 #include "av1/common/entropy.h"
 #include "av1/common/entropymv.h"
@@ -186,6 +186,8 @@ void av1_set_default_mode_deltas(int8_t *mode_deltas);
 void av1_setup_frame_contexts(struct AV1Common *cm);
 void av1_setup_past_independence(struct AV1Common *cm);
 
+// Returns (int)ceil(log2(n)).
+// NOTE: This implementation only works for n <= 2^30.
 static INLINE int av1_ceil_log2(int n) {
   if (n < 2) return 0;
   int i = 1, p = 2;
@@ -207,4 +209,4 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENTROPYMODE_H_
+#endif  // AOM_AV1_COMMON_ENTROPYMODE_H_
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
index 446aa433c..491337387 100644
--- a/third_party/aom/av1/common/entropymv.c
+++ b/third_party/aom/av1/common/entropymv.c
@@ -60,61 +60,6 @@ static const nmv_context default_nmv_context = {
     } },
 };
 
-static const uint8_t log_in_base_2[] = {
-  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-  4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
-};
-
-static INLINE int mv_class_base(MV_CLASS_TYPE c) {
-  return c ? CLASS0_SIZE << (c + 2) : 0;
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
-  const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
-                              ? MV_CLASS_10
-                              : (MV_CLASS_TYPE)log_in_base_2[z >> 3];
-  if (offset) *offset = z - mv_class_base(c);
-  return c;
-}
-
 void av1_init_mv_probs(AV1_COMMON *cm) {
   // NB: this sets CDFs too
   cm->fc->nmvc = default_nmv_context;
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
index 02ca7b66b..fa818a2c1 100644
--- a/third_party/aom/av1/common/entropymv.h
+++ b/third_party/aom/av1/common/entropymv.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENTROPYMV_H_
-#define AV1_COMMON_ENTROPYMV_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMV_H_
+#define AOM_AV1_COMMON_ENTROPYMV_H_
 
 #include "config/aom_config.h"
 
@@ -91,16 +91,6 @@ typedef struct {
   nmv_component comps[2];
 } nmv_context;
 
-static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
-  if (mv->row == 0) {
-    return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
-  } else {
-    return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
-  }
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset);
-
 typedef enum {
   MV_SUBPEL_NONE = -1,
   MV_SUBPEL_LOW_PRECISION = 0,
@@ -111,4 +101,4 @@ typedef enum {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENTROPYMV_H_
+#endif  // AOM_AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index 689c25f30..869c06ef2 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ENUMS_H_
-#define AV1_COMMON_ENUMS_H_
+#ifndef AOM_AV1_COMMON_ENUMS_H_
+#define AOM_AV1_COMMON_ENUMS_H_
 
 #include "config/aom_config.h"
 
@@ -274,7 +274,7 @@ typedef enum ATTRIBUTE_PACKED {
   TX_TYPES,
 } TX_TYPE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   REG_REG,
   REG_SMOOTH,
   REG_SHARP,
@@ -438,6 +438,8 @@ typedef enum ATTRIBUTE_PACKED {
   COMP_INTER_MODE_START = NEAREST_NEARESTMV,
   COMP_INTER_MODE_END = MB_MODE_COUNT,
   COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+  INTER_MODE_START = NEARESTMV,
+  INTER_MODE_END = MB_MODE_COUNT,
   INTRA_MODES = PAETH_PRED + 1,  // PAETH_PRED has to be the last intra mode.
   INTRA_INVALID = MB_MODE_COUNT  // For uv_mode in inter blocks
 } PREDICTION_MODE;
@@ -478,7 +480,7 @@ typedef enum ATTRIBUTE_PACKED {
   INTERINTRA_MODES
 } INTERINTRA_MODE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   COMPOUND_AVERAGE,
   COMPOUND_WEDGE,
   COMPOUND_DIFFWTD,
@@ -614,4 +616,4 @@ typedef enum ATTRIBUTE_PACKED {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ENUMS_H_
+#endif  // AOM_AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 7f8ad583a..571422d11 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_FILTER_H_
-#define AV1_COMMON_FILTER_H_
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
 
 #include <assert.h>
 
@@ -139,6 +139,17 @@ static const InterpFilterParams
         BILINEAR }
     };
 
+// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full-pixel
+// MVs for luma; when chroma is sub-sampled, chroma may use a half-pel MV.
+DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
+  64,
+  64,
+};
+
+static const InterpFilterParams av1_intrabc_filter_params = {
+  av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+};
+
 DECLARE_ALIGNED(256, static const InterpKernel,
                 av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
@@ -181,6 +192,11 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
   return &av1_interp_filter_params_list[interp_filter];
 }
 
+static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
+    const InterpFilter interp_filter) {
+  return &av1_interp_4tap[interp_filter];
+}
+
 static INLINE const int16_t *av1_get_interp_filter_kernel(
     const InterpFilter interp_filter) {
   return av1_interp_filter_params_list[interp_filter].filter_ptr;
@@ -195,4 +211,4 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_FILTER_H_
+#endif  // AOM_AV1_COMMON_FILTER_H_
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
index 502ccd27d..fd6c4bc79 100644
--- a/third_party/aom/av1/common/frame_buffers.c
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -38,6 +38,17 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
   list->int_fb = NULL;
 }
 
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    if (list->int_fb[i].data && !list->int_fb[i].in_use)
+      memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+  }
+}
+
 int av1_get_frame_buffer(void *cb_priv, size_t min_size,
                          aom_codec_frame_buffer_t *fb) {
   int i;
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
index e7341cfdd..16188e51c 100644
--- a/third_party/aom/av1/common/frame_buffers.h
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_FRAME_BUFFERS_H_
-#define AV1_COMMON_FRAME_BUFFERS_H_
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
 
 #include "aom/aom_frame_buffer.h"
 #include "aom/aom_integer.h"
@@ -36,6 +36,12 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
 // Free any data allocated to the frame buffers.
 void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
 
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
 // Callback used by libaom to request an external frame buffer. |cb_priv|
 // Callback private data, which points to an InternalFrameBufferList.
 // |min_size| is the minimum size in bytes needed to decode the next frame.
@@ -51,4 +57,4 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_FRAME_BUFFERS_H_
+#endif  // AOM_AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
index bc758eb57..2c1cb9827 100644
--- a/third_party/aom/av1/common/idct.c
+++ b/third_party/aom/av1/common/idct.c
@@ -31,21 +31,16 @@ int av1_get_tx_scale(const TX_SIZE tx_size) {
 // that input and output could be the same buffer.
 
 // idct
-static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest,
-                               int stride, int eob, int bd) {
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
   if (eob > 1)
     av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
   else
     av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
 }
 
-static const int32_t *cast_to_int32(const tran_low_t *input) {
-  assert(sizeof(int32_t) == sizeof(tran_low_t));
-  return (const int32_t *)input;
-}
-
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   int eob = txfm_param->eob;
   int bd = txfm_param->bd;
@@ -54,206 +49,150 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
   const TX_TYPE tx_type = txfm_param->tx_type;
   if (lossless) {
     assert(tx_type == DCT_DCT);
-    highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
     return;
   }
-  switch (tx_type) {
-    // Assembly version doesn't support some transform types, so use C version
-    // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                             bd);
-      break;
-  }
+
+  av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
 }
 
-static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
-                                    int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                                 int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride,
-                         txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
-                                    int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                                 int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride,
-                         txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
-                                     int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride,
-                          txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x64(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
-}
 
-static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
-  const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
 }
 
-static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
-                                    int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
   int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    // Assembly version doesn't support some transform types, so use C version
-    // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+
+  av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                              bd);
-      break;
-  }
 }
 
-static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
-  const TX_TYPE tx_type = txfm_param->tx_type;
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    // Assembly version doesn't support some transform types, so use C version
-    // for those.
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
-    default:
-      av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-  }
+  av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
 }
 
-static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                            txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    // Assembly version doesn't support IDTX, so use C version for it.
-    case IDTX:
-      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 tx_type, bd);
-      break;
 
-    default: assert(0);
-  }
+  av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             bd);
 }
 
-static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
-                                      int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
   const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
   assert(tx_type == DCT_DCT);
-  av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
+  av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             bd);
 }
 
 static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
@@ -270,70 +209,70 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
       txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
 }
 
-static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
-                                int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
+                               int stride, const TxfmParam *txfm_param) {
   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
     case TX_32X32:
-      highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param);
       break;
     case TX_16X16:
-      highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param);
       break;
     case TX_8X8:
-      highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
       break;
     case TX_4X8:
-      highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
       break;
     case TX_8X4:
-      highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
       break;
     case TX_8X16:
-      highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
       break;
     case TX_16X8:
-      highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
       break;
     case TX_16X32:
-      highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
       break;
     case TX_32X16:
-      highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
       break;
     case TX_64X64:
-      highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
       break;
     case TX_32X64:
-      highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
       break;
     case TX_64X32:
-      highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
       break;
     case TX_16X64:
-      highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
       break;
     case TX_64X16:
-      highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
       break;
     case TX_4X4:
       // this is like av1_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
-      av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
       break;
     case TX_16X4:
-      highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
       break;
     case TX_4X16:
-      highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
       break;
     case TX_8X32:
-      highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
       break;
     case TX_32X8:
-      highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
       break;
     default: assert(0 && "Invalid transform size"); break;
   }
@@ -352,7 +291,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
     }
   }
 
-  highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param);
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                          txfm_param);
 
   for (int r = 0; r < h; ++r) {
     for (int c = 0; c < w; ++c) {
@@ -375,7 +315,7 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
   assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
 
   if (txfm_param.is_hbd) {
-    highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
   } else {
     av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
   }
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
index 50032a167..d9454e73f 100644
--- a/third_party/aom/av1/common/idct.h
+++ b/third_party/aom/av1/common/idct.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_IDCT_H_
-#define AV1_COMMON_IDCT_H_
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
 
 #include "config/aom_config.h"
 
@@ -36,11 +36,32 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
                                  const tran_low_t *dqcoeff, int plane,
                                  TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
                                  int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+  assert(sizeof(int32_t) == sizeof(tran_low_t));
+  return (const int32_t *)input;
+}
+
+typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *param);
+
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8;
 
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *param);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_IDCT_H_
+#endif  // AOM_AV1_COMMON_IDCT_H_
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index c2495640e..5b0225192 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_MV_H_
-#define AV1_COMMON_MV_H_
+#ifndef AOM_AV1_COMMON_MV_H_
+#define AOM_AV1_COMMON_MV_H_
 
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
@@ -56,7 +56,7 @@ typedef struct mv32 {
 #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
 
 /* clang-format off */
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   IDENTITY = 0,      // identity transformation, 0-parameter
   TRANSLATION = 1,   // translational motion 2-parameter
   ROTZOOM = 2,       // simplified affine with rotation + zoom only, 4-parameter
@@ -298,4 +298,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_MV_H_
+#endif  // AOM_AV1_COMMON_MV_H_
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
index 6939df335..7f24ab4e6 100644
--- a/third_party/aom/av1/common/mvref_common.c
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -27,16 +27,19 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) {
   den = AOMMIN(den, MAX_FRAME_DISTANCE);
   num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
                 : AOMMAX(num, -MAX_FRAME_DISTANCE);
-  int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
-  int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
+  const int mv_row =
+      ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
+  const int mv_col =
+      ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
   const int clamp_max = MV_UPP - 1;
   const int clamp_min = MV_LOW + 1;
   output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
   output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
 }
 
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
-                        int mi_row, int mi_col, int x_mis, int y_mis) {
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis) {
   const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
   MV_REF *frame_mvs =
       cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
@@ -141,38 +144,37 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                           uint8_t *ref_match_count, uint8_t *newmv_count,
                           int_mv *gm_mv_candidates, int max_row_offset,
                           int *processed_rows) {
-  int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+  int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
   end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
   const int n8_w_8 = mi_size_wide[BLOCK_8X8];
   const int n8_w_16 = mi_size_wide[BLOCK_16X16];
   int i;
   int col_offset = 0;
-  const int shift = 0;
   // TODO(jingning): Revisit this part after cb4x4 is stable.
   if (abs(row_offset) > 1) {
     col_offset = 1;
-    if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset;
+    if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
   }
-  const int use_step_16 = (xd->n8_w >= 16);
+  const int use_step_16 = (xd->n4_w >= 16);
   MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
   (void)mi_row;
 
   for (i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
     const int candidate_bsize = candidate->sb_type;
-    const int n8_w = mi_size_wide[candidate_bsize];
-    int len = AOMMIN(xd->n8_w, n8_w);
+    const int n4_w = mi_size_wide[candidate_bsize];
+    int len = AOMMIN(xd->n4_w, n4_w);
     if (use_step_16)
       len = AOMMAX(n8_w_16, len);
     else if (abs(row_offset) > 1)
       len = AOMMAX(len, n8_w_8);
 
     int weight = 2;
-    if (xd->n8_w >= n8_w_8 && xd->n8_w <= n8_w) {
+    if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
       int inc = AOMMIN(-max_row_offset + row_offset + 1,
                        mi_size_high[candidate_bsize]);
       // Obtain range used in weight calculation.
-      weight = AOMMAX(weight, (inc << shift));
+      weight = AOMMAX(weight, inc);
       // Update processed rows.
       *processed_rows = inc - row_offset - 1;
     }
@@ -192,37 +194,36 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                           uint8_t *ref_match_count, uint8_t *newmv_count,
                           int_mv *gm_mv_candidates, int max_col_offset,
                           int *processed_cols) {
-  int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+  int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
   end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
   const int n8_h_8 = mi_size_high[BLOCK_8X8];
   const int n8_h_16 = mi_size_high[BLOCK_16X16];
   int i;
   int row_offset = 0;
-  const int shift = 0;
   if (abs(col_offset) > 1) {
     row_offset = 1;
-    if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset;
+    if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
   }
-  const int use_step_16 = (xd->n8_h >= 16);
+  const int use_step_16 = (xd->n4_h >= 16);
   (void)mi_col;
 
   for (i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate =
         xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
     const int candidate_bsize = candidate->sb_type;
-    const int n8_h = mi_size_high[candidate_bsize];
-    int len = AOMMIN(xd->n8_h, n8_h);
+    const int n4_h = mi_size_high[candidate_bsize];
+    int len = AOMMIN(xd->n4_h, n4_h);
     if (use_step_16)
       len = AOMMAX(n8_h_16, len);
     else if (abs(col_offset) > 1)
       len = AOMMAX(len, n8_h_8);
 
     int weight = 2;
-    if (xd->n8_h >= n8_h_8 && xd->n8_h <= n8_h) {
+    if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
       int inc = AOMMIN(-max_col_offset + col_offset + 1,
                        mi_size_wide[candidate_bsize]);
       // Obtain range used in weight calculation.
-      weight = AOMMAX(weight, (inc << shift));
+      weight = AOMMAX(weight, inc);
       // Update processed cols.
       *processed_cols = inc - col_offset - 1;
     }
@@ -248,7 +249,7 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   mi_pos.row = row_offset;
   mi_pos.col = col_offset;
 
-  if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+  if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
     const MB_MODE_INFO *const candidate =
         xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
     const int len = mi_size_wide[BLOCK_8X8];
@@ -290,19 +291,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
 
   // The left hand of two vertical rectangles always has a top right (as the
   // block above will have been decoded)
-  if (xd->n8_w < xd->n8_h)
+  if (xd->n4_w < xd->n4_h)
     if (!xd->is_sec_rect) has_tr = 1;
 
   // The bottom of two horizontal rectangles never has a top right (as the block
   // to the right won't have been decoded)
-  if (xd->n8_w > xd->n8_h)
+  if (xd->n4_w > xd->n4_h)
     if (xd->is_sec_rect) has_tr = 0;
 
   // The bottom left square of a Vertical A (in the old format) does
   // not have a top right as it is decoded before the right hand
   // rectangle of the partition
   if (xd->mi[0]->partition == PARTITION_VERT_A) {
-    if (xd->n8_w == xd->n8_h)
+    if (xd->n4_w == xd->n4_h)
       if (mask_row & bs) has_tr = 0;
   }
 
@@ -335,7 +336,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
   mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
 
-  if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0;
+  if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
 
   const TPL_MV_REF *prev_frame_mvs =
       cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
@@ -430,20 +431,75 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   return 0;
 }
 
+static void process_compound_ref_mv_candidate(
+    const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+    const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
+    int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
+  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+    MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+    for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+      if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+        ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+        ++ref_id_count[cmp_idx];
+      } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+        int_mv this_mv = candidate->mv[rf_idx];
+        if (cm->ref_frame_sign_bias[can_rf] !=
+            cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+          this_mv.as_mv.row = -this_mv.as_mv.row;
+          this_mv.as_mv.col = -this_mv.as_mv.col;
+        }
+        ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+        ++ref_diff_count[cmp_idx];
+      }
+    }
+  }
+}
+
+static void process_single_ref_mv_candidate(
+    const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+    MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+    CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+    if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+      int_mv this_mv = candidate->mv[rf_idx];
+      if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+          cm->ref_frame_sign_bias[ref_frame]) {
+        this_mv.as_mv.row = -this_mv.as_mv.row;
+        this_mv.as_mv.col = -this_mv.as_mv.col;
+      }
+      int stack_idx;
+      for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+        const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+        if (this_mv.as_int == stack_mv.as_int) break;
+      }
+
+      if (stack_idx == refmv_count[ref_frame]) {
+        ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+
+        // TODO(jingning): Set an arbitrary small number here. The weight
+        // doesn't matter as long as it is properly initialized.
+        ref_mv_stack[ref_frame][stack_idx].weight = 2;
+        ++refmv_count[ref_frame];
+      }
+    }
+  }
+}
+
 static void setup_ref_mv_list(
     const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
     uint8_t refmv_count[MODE_CTX_REF_FRAMES],
     CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
     int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
     int mi_row, int mi_col, int16_t *mode_context) {
-  const int bs = AOMMAX(xd->n8_w, xd->n8_h);
+  const int bs = AOMMAX(xd->n4_w, xd->n4_h);
   const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
   MV_REFERENCE_FRAME rf[2];
 
   const TileInfo *const tile = &xd->tile;
   int max_row_offset = 0, max_col_offset = 0;
-  const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
-  const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+  const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+  const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
   int processed_rows = 0;
   int processed_cols = 0;
 
@@ -455,17 +511,16 @@ static void setup_ref_mv_list(
   if (xd->up_available) {
     max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
 
-    if (xd->n8_h < mi_size_high[BLOCK_8X8])
+    if (xd->n4_h < mi_size_high[BLOCK_8X8])
       max_row_offset = -(2 << 1) + row_adj;
 
-    max_row_offset =
-        find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset);
+    max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
   }
 
   if (xd->left_available) {
     max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
 
-    if (xd->n8_w < mi_size_wide[BLOCK_8X8])
+    if (xd->n4_w < mi_size_wide[BLOCK_8X8])
       max_col_offset = -(2 << 1) + col_adj;
 
     max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
@@ -487,12 +542,12 @@ static void setup_ref_mv_list(
                   gm_mv_candidates, max_col_offset, &processed_cols);
   // Check top-right boundary
   if (has_tr)
-    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w,
+    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
                   ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
                   gm_mv_candidates, &refmv_count[ref_frame]);
 
-  uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
-  uint8_t nearest_refmv_count = refmv_count[ref_frame];
+  const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+  const uint8_t nearest_refmv_count = refmv_count[ref_frame];
 
   // TODO(yunqing): for comp_search, do it for all 3 cases.
   for (int idx = 0; idx < nearest_refmv_count; ++idx)
@@ -500,27 +555,27 @@ static void setup_ref_mv_list(
 
   if (cm->allow_ref_frame_mvs) {
     int is_available = 0;
-    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h);
-    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w);
-    const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]);
-    const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]);
+    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
+    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
+    const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
+    const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
 
     const int tpl_sample_pos[3][2] = {
       { voffset, -2 },
       { voffset, hoffset },
       { voffset - 2, hoffset },
     };
-    const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) &&
-                                (xd->n8_h < mi_size_high[BLOCK_64X64]) &&
-                                (xd->n8_w >= mi_size_wide[BLOCK_8X8]) &&
-                                (xd->n8_w < mi_size_wide[BLOCK_64X64]);
-
-    int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64])
-                     ? mi_size_high[BLOCK_16X16]
-                     : mi_size_high[BLOCK_8X8];
-    int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64])
-                     ? mi_size_wide[BLOCK_16X16]
-                     : mi_size_wide[BLOCK_8X8];
+    const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
+                                (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
+                                (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
+                                (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+
+    const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+                           ? mi_size_high[BLOCK_16X16]
+                           : mi_size_high[BLOCK_8X8];
+    const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+                           ? mi_size_wide[BLOCK_16X16]
+                           : mi_size_wide[BLOCK_8X8];
 
     for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
       for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
@@ -569,7 +624,7 @@ static void setup_ref_mv_list(
                     max_col_offset, &processed_cols);
   }
 
-  uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+  const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
 
   switch (nearest_match) {
     case 0:
@@ -636,62 +691,24 @@ static void setup_ref_mv_list(
       int_mv ref_id[2][2], ref_diff[2][2];
       int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
 
-      int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+      int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
       mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-      int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+      int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
       mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
       int mi_size = AOMMIN(mi_width, mi_height);
 
       for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
         const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
-        const int candidate_bsize = candidate->sb_type;
-
-        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
-          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
-            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
-              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
-              ++ref_id_count[cmp_idx];
-            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
-              int_mv this_mv = candidate->mv[rf_idx];
-              if (cm->ref_frame_sign_bias[can_rf] !=
-                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
-                this_mv.as_mv.row = -this_mv.as_mv.row;
-                this_mv.as_mv.col = -this_mv.as_mv.col;
-              }
-              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
-              ++ref_diff_count[cmp_idx];
-            }
-          }
-        }
-        idx += mi_size_wide[candidate_bsize];
+        process_compound_ref_mv_candidate(
+            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+        idx += mi_size_wide[candidate->sb_type];
       }
 
       for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
         const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
-        const int candidate_bsize = candidate->sb_type;
-
-        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
-          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
-            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
-              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
-              ++ref_id_count[cmp_idx];
-            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
-              int_mv this_mv = candidate->mv[rf_idx];
-              if (cm->ref_frame_sign_bias[can_rf] !=
-                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
-                this_mv.as_mv.row = -this_mv.as_mv.row;
-                this_mv.as_mv.col = -this_mv.as_mv.col;
-              }
-              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
-              ++ref_diff_count[cmp_idx];
-            }
-          }
-        }
-        idx += mi_size_high[candidate_bsize];
+        process_compound_ref_mv_candidate(
+            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+        idx += mi_size_high[candidate->sb_type];
       }
 
       // Build up the compound mv predictor
@@ -743,87 +760,37 @@ static void setup_ref_mv_list(
 
     for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
       clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
       clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
     }
   } else {
     // Handle single reference frame extension
-    int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+    int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
     mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-    int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+    int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
     mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
     int mi_size = AOMMIN(mi_width, mi_height);
 
     for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
                       refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
       const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
-      const int candidate_bsize = candidate->sb_type;
-
-      // TODO(jingning): Refactor the following code.
-      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
-          int_mv this_mv = candidate->mv[rf_idx];
-          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
-              cm->ref_frame_sign_bias[ref_frame]) {
-            this_mv.as_mv.row = -this_mv.as_mv.row;
-            this_mv.as_mv.col = -this_mv.as_mv.col;
-          }
-          int stack_idx;
-          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
-            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
-            if (this_mv.as_int == stack_mv.as_int) break;
-          }
-
-          if (stack_idx == refmv_count[ref_frame]) {
-            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
-            // TODO(jingning): Set an arbitrary small number here. The weight
-            // doesn't matter as long as it is properly initialized.
-            ref_mv_stack[ref_frame][stack_idx].weight = 2;
-            ++refmv_count[ref_frame];
-          }
-        }
-      }
-      idx += mi_size_wide[candidate_bsize];
+      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+                                      ref_mv_stack);
+      idx += mi_size_wide[candidate->sb_type];
     }
 
     for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
                       refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
       const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
-      const int candidate_bsize = candidate->sb_type;
-
-      // TODO(jingning): Refactor the following code.
-      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
-          int_mv this_mv = candidate->mv[rf_idx];
-          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
-              cm->ref_frame_sign_bias[ref_frame]) {
-            this_mv.as_mv.row = -this_mv.as_mv.row;
-            this_mv.as_mv.col = -this_mv.as_mv.col;
-          }
-          int stack_idx;
-          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
-            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
-            if (this_mv.as_int == stack_mv.as_int) break;
-          }
-
-          if (stack_idx == refmv_count[ref_frame]) {
-            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
-            // TODO(jingning): Set an arbitrary small number here. The weight
-            // doesn't matter as long as it is properly initialized.
-            ref_mv_stack[ref_frame][stack_idx].weight = 2;
-            ++refmv_count[ref_frame];
-          }
-        }
-      }
-      idx += mi_size_high[candidate_bsize];
+      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+                                      ref_mv_stack);
+      idx += mi_size_high[candidate->sb_type];
     }
 
     for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
       clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+                   xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
     }
 
     if (mv_ref_list != NULL) {
@@ -936,8 +903,10 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
   const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
                                        : -((-mv.col) >> (4 + MI_SIZE_LOG2));
 
-  int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
-  int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+  const int row =
+      (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+  const int col =
+      (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
 
   if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
       col >= (cm->mi_cols >> 1))
@@ -955,37 +924,44 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
   return 1;
 }
 
-static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
-                                   int dir) {
+// Note: motion_field_projection finds motion vectors of the current frame's
+// reference frame and projects them onto the current frame. For clarity,
+// the current frame's reference frame is called the start frame, the start
+// frame's own reference frames are called reference frames, and ref_offset
+// holds the frame distances between the start frame and its reference
+// frames.
+static int motion_field_projection(AV1_COMMON *cm,
+                                   MV_REFERENCE_FRAME start_frame, int dir) {
   TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
   int ref_offset[REF_FRAMES] = { 0 };
 
   (void)dir;
 
-  int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx;
-  if (ref_frame_idx < 0) return 0;
+  const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx;
+  if (start_frame_idx < 0) return 0;
 
-  if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0;
+  if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0;
 
-  if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows ||
-      cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols)
+  if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows ||
+      cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols)
     return 0;
 
-  int ref_frame_index =
-      cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset;
-  unsigned int *ref_rf_idx =
-      &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0];
-  int cur_frame_index = cm->cur_frame->cur_frame_offset;
-  int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index);
+  const int start_frame_offset =
+      cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset;
+  const unsigned int *const ref_frame_offsets =
+      &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0];
+  const int cur_frame_offset = cm->cur_frame->cur_frame_offset;
+  int start_to_current_frame_offset =
+      get_relative_dist(cm, start_frame_offset, cur_frame_offset);
 
   for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
-    ref_offset[rf] =
-        get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]);
+    ref_offset[rf] = get_relative_dist(cm, start_frame_offset,
+                                       ref_frame_offsets[rf - LAST_FRAME]);
   }
 
-  if (dir == 2) ref_to_cur = -ref_to_cur;
+  if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
 
-  MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[ref_frame_idx].mvs;
+  MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs;
   const int mvs_rows = (cm->mi_rows + 1) >> 1;
   const int mvs_cols = (cm->mi_cols + 1) >> 1;
 
@@ -999,19 +975,20 @@ static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
         int mi_r, mi_c;
         const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
 
-        int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
-                        ref_frame_offset > 0 &&
-                        abs(ref_to_cur) <= MAX_FRAME_DISTANCE;
+        int pos_valid =
+            abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+            ref_frame_offset > 0 &&
+            abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
 
         if (pos_valid) {
-          get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur,
-                            ref_frame_offset);
+          get_mv_projection(&this_mv.as_mv, fwd_mv,
+                            start_to_current_frame_offset, ref_frame_offset);
           pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
                                          this_mv.as_mv, dir >> 1);
         }
 
         if (pos_valid) {
-          int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+          const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
 
           tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
           tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
@@ -1167,14 +1144,14 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
   if (up_available) {
     int mi_row_offset = -1;
     MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
-    uint8_t n8_w = mi_size_wide[mbmi->sb_type];
+    uint8_t n4_w = mi_size_wide[mbmi->sb_type];
 
-    if (xd->n8_w <= n8_w) {
+    if (xd->n4_w <= n4_w) {
       // Handle "current block width <= above block width" case.
-      int col_offset = -mi_col % n8_w;
+      int col_offset = -mi_col % n4_w;
 
       if (col_offset < 0) do_tl = 0;
-      if (col_offset + n8_w > xd->n8_w) do_tr = 0;
+      if (col_offset + n4_w > xd->n4_w) do_tr = 0;
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
         record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
@@ -1185,11 +1162,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
       }
     } else {
       // Handle "current block width > above block width" case.
-      for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
+      for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
         int mi_col_offset = i;
         mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        n8_w = mi_size_wide[mbmi->sb_type];
-        mi_step = AOMMIN(xd->n8_w, n8_w);
+        n4_w = mi_size_wide[mbmi->sb_type];
+        mi_step = AOMMIN(xd->n4_w, n4_w);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1209,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
     int mi_col_offset = -1;
 
     MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
-    uint8_t n8_h = mi_size_high[mbmi->sb_type];
+    uint8_t n4_h = mi_size_high[mbmi->sb_type];
 
-    if (xd->n8_h <= n8_h) {
+    if (xd->n4_h <= n4_h) {
       // Handle "current block height <= above block height" case.
-      int row_offset = -mi_row % n8_h;
+      int row_offset = -mi_row % n4_h;
 
       if (row_offset < 0) do_tl = 0;
 
@@ -1226,11 +1203,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
       }
     } else {
       // Handle "current block height > above block height" case.
-      for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+      for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
         int mi_row_offset = i;
         mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        n8_h = mi_size_high[mbmi->sb_type];
-        mi_step = AOMMIN(xd->n8_h, n8_h);
+        n4_h = mi_size_high[mbmi->sb_type];
+        mi_step = AOMMIN(xd->n4_h, n4_h);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1264,18 +1241,18 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
 
   // Top-right block
   if (do_tr &&
-      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) {
-    POSITION trb_pos = { -1, xd->n8_w };
+      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
+    POSITION trb_pos = { -1, xd->n4_w };
 
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) {
+    if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
       int mi_row_offset = -1;
-      int mi_col_offset = xd->n8_w;
+      int mi_col_offset = xd->n4_w;
 
       MB_MODE_INFO *mbmi =
           xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1);
+        record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
         np++;
         if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
@@ -1372,7 +1349,7 @@ static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
 
 static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
                                REF_FRAME_INFO *ref_info) {
-  assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME);
+  assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
 
   const int buf_idx = ref_info->buf_idx;
 
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index f68c159e1..83f7a1ac0 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_MVREF_COMMON_H_
-#define AV1_COMMON_MVREF_COMMON_H_
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
 
 #include "av1/common/onyxc_int.h"
 #include "av1/common/blockd.h"
@@ -85,29 +85,17 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
 // Checks that the given mi_row, mi_col and search point
 // are inside the borders of the tile.
 static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
-                            int mi_rows, const POSITION *mi_pos) {
-  const int dependent_horz_tile_flag = 0;
-  if (dependent_horz_tile_flag && !tile->tg_horz_boundary) {
-    return !(mi_row + mi_pos->row < 0 ||
-             mi_col + mi_pos->col < tile->mi_col_start ||
-             mi_row + mi_pos->row >= mi_rows ||
-             mi_col + mi_pos->col >= tile->mi_col_end);
-  } else {
-    return !(mi_row + mi_pos->row < tile->mi_row_start ||
-             mi_col + mi_pos->col < tile->mi_col_start ||
-             mi_row + mi_pos->row >= tile->mi_row_end ||
-             mi_col + mi_pos->col >= tile->mi_col_end);
-  }
+                            const POSITION *mi_pos) {
+  return !(mi_row + mi_pos->row < tile->mi_row_start ||
+           mi_col + mi_pos->col < tile->mi_col_start ||
+           mi_row + mi_pos->row >= tile->mi_row_end ||
+           mi_col + mi_pos->col >= tile->mi_col_end);
 }
 
 static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
-                                        int mi_rows, int row_offset) {
-  const int dependent_horz_tile_flag = 0;
-  if (dependent_horz_tile_flag && !tile->tg_horz_boundary)
-    return clamp(row_offset, -mi_row, mi_rows - mi_row - 1);
-  else
-    return clamp(row_offset, tile->mi_row_start - mi_row,
-                 tile->mi_row_end - mi_row - 1);
+                                        int row_offset) {
+  return clamp(row_offset, tile->mi_row_start - mi_row,
+               tile->mi_row_end - mi_row - 1);
 }
 
 static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
@@ -263,8 +251,9 @@ static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
   }
 }
 
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
-                        int mi_row, int mi_col, int x_mis, int y_mis);
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis);
 
 void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                       MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
@@ -286,7 +275,6 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
 
 #define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
 #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
-#define USE_WAVE_FRONT 1  // Use only top left area of frame for reference.
 
 static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
                                    int mib_size, int mi_row, int mi_col) {
@@ -356,13 +344,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
   const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
   if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
 
-#if USE_WAVE_FRONT
+  // Wavefront constraint: use only top left area of frame for reference.
   const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
   const int wf_offset = gradient * (active_sb_row - src_sb_row);
   if (src_sb_row > active_sb_row ||
       src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
     return 0;
-#endif
 
   return 1;
 }
@@ -371,4 +358,4 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_MVREF_COMMON_H_
+#endif  // AOM_AV1_COMMON_MVREF_COMMON_H_
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
index 3918c82c6..1c90cd93f 100644
--- a/third_party/aom/av1/common/obmc.h
+++ b/third_party/aom/av1/common/obmc.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_OBMC_H_
-#define AV1_COMMON_OBMC_H_
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
 
 typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
                                           uint8_t nb_mi_size,
@@ -30,7 +30,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
   // prev_row_mi points into the mi array, starting at the beginning of the
   // previous row.
   MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
-  const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols);
+  const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
   uint8_t mi_step;
   for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
        above_mi_col += mi_step) {
@@ -49,7 +49,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
     }
     if (is_neighbor_overlappable(*above_mi)) {
       ++nb_count;
-      fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi,
+      fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
           fun_ctxt, num_planes);
     }
   }
@@ -68,7 +68,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
   // prev_col_mi points into the mi array, starting at the top of the
   // previous column
   MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
-  const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows);
+  const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
   uint8_t mi_step;
   for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
        left_mi_row += mi_step) {
@@ -82,10 +82,10 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
     }
     if (is_neighbor_overlappable(*left_mi)) {
       ++nb_count;
-      fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi,
+      fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
           fun_ctxt, num_planes);
     }
   }
 }
 
-#endif  // AV1_COMMON_OBMC_H_
+#endif  // AOM_AV1_COMMON_OBMC_H_
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 000000000..823b700b1
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when obu_type is one of the OBU types listed below, else 0.
+static int valid_obu_type(int obu_type) {
+  int valid_type = 0;  // stays 0 unless a listed type matches
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: valid_type = 1; break;
+    default: break;  // any other value is reported as invalid
+  }
+  return valid_type;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t u_obu_size = 0;  // decoded as 64-bit, range-checked below
+  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+      0) {  // non-zero return is treated as a decode failure
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;  // size cap
+  *obu_size = (size_t)u_obu_size;
+  return AOM_CODEC_OK;
+}
+
+// Parses a 1- or 2-byte OBU header from 'rb' and stores values in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+                                       int is_annexb, ObuHeader *header) {
+  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;  // no data
+
+  header->size = 1;  // bumped to 2 below when an extension byte is present
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // Forbidden bit. Must not be set.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);  // 4-bit type field
+
+  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->has_extension = aom_rb_read_bit(rb);
+  header->has_size_field = aom_rb_read_bit(rb);
+
+  if (!header->has_size_field && !is_annexb) {
+    // Section 5 (non-Annex-B) OBU streams must have the obu_size field set.
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  }
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // obu_reserved_1bit must be set to 0.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (header->has_extension) {
+    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+    header->size += 1;  // account for the extension byte
+    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+    if (aom_rb_read_literal(rb, 3) != 0) {
+      // extension_header_reserved_3bits must be set to 0.
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
+  struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+                                    NULL };  // NOTE(review): buffer unchecked
+  aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+  if (parse_result == AOM_CODEC_OK) *consumed = header->size;  // 1 or 2 bytes
+  return parse_result;
+}
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read) {
+  size_t length_field_size = 0, obu_size = 0;
+  aom_codec_err_t status;  // NOTE(review): payload_size/bytes_read unchecked
+
+  if (is_annexb) {
+    // Size field comes before the OBU header, and includes the OBU header.
+    status =
+        read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  struct aom_read_bit_buffer rb = { data + length_field_size,
+                                    data + bytes_available, 0, NULL, NULL };
+
+  status = read_obu_header(&rb, is_annexb, obu_header);
+  if (status != AOM_CODEC_OK) return status;
+
+  if (is_annexb) {
+    // Derive the payload size from the data we've already read.
+    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+    *payload_size = obu_size - obu_header->size;
+  } else {
+    // Size field comes after the OBU header, and is just the payload size.
+    status = read_obu_size(data + obu_header->size,
+                           bytes_available - obu_header->size, payload_size,
+                           &length_field_size);
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  *bytes_read = length_field_size + obu_header->size;  // size field + header
+  return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 000000000..7c56904c8
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  size_t size;         // Size (1 or 2 bytes) of the OBU header (including the
+                       // optional OBU extension header) in the bitstream.
+  OBU_TYPE type;       // Type of the OBU; OBU_TYPE is declared elsewhere.
+  int has_size_field;  // presumably the obu_has_size_field bit; confirm
+  int has_extension;   // Gates the two extension-header fields below.
+  // The following fields come from the OBU extension header and therefore are
+  // only used if has_extension is true.
+  int temporal_layer_id;
+  int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_OBU_UTIL_H_
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
index e87c5a0bf..e1db0f44d 100644
--- a/third_party/aom/av1/common/odintrin.h
+++ b/third_party/aom/av1/common/odintrin.h
@@ -11,8 +11,8 @@
 
 /* clang-format off */
 
-#ifndef AV1_COMMON_ODINTRIN_H_
-#define AV1_COMMON_ODINTRIN_H_
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
 
 #include <stdlib.h>
 #include <string.h>
@@ -46,9 +46,9 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
 #define OD_MAXI AOMMAX
 #define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
 
-#define OD_CLZ0 (1)
-#define OD_CLZ(x) (-get_msb(x))
-#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x))
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer, i.e. its
+  length in bits: OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
 
 /*Enable special features for gcc and compatible compilers.*/
 #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
@@ -93,4 +93,4 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ODINTRIN_H_
+#endif  // AOM_AV1_COMMON_ODINTRIN_H_
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index 6b1bf2d74..ff011c89e 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_ONYXC_INT_H_
-#define AV1_COMMON_ONYXC_INT_H_
+#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
+#define AOM_AV1_COMMON_ONYXC_INT_H_
 
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
@@ -480,6 +480,7 @@ typedef struct AV1Common {
 
   int byte_alignment;
   int skip_loop_filter;
+  int skip_film_grain;
 
   // Private data associated with the frame buffer callbacks.
   void *cb_priv;
@@ -823,18 +824,18 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
     xd->chroma_left_mbmi = chroma_left_mi;
   }
 
-  xd->n8_h = bh;
-  xd->n8_w = bw;
+  xd->n4_h = bh;
+  xd->n4_w = bw;
   xd->is_sec_rect = 0;
-  if (xd->n8_w < xd->n8_h) {
+  if (xd->n4_w < xd->n4_h) {
     // Only mark is_sec_rect as 1 for the last block.
     // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
     // For other partitions, it would be (0, 1).
-    if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1;
+    if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
   }
 
-  if (xd->n8_w > xd->n8_h)
-    if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1;
+  if (xd->n4_w > xd->n4_h)
+    if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
 }
 
 static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
@@ -1115,18 +1116,18 @@ static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
   for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
 }
 
-static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip,
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
                                  const MACROBLOCKD *xd) {
   uint8_t bw = tx_size_wide[tx_size];
   uint8_t bh = tx_size_high[tx_size];
 
   if (skip) {
-    bw = n8_w * MI_SIZE;
-    bh = n8_h * MI_SIZE;
+    bw = n4_w * MI_SIZE;
+    bh = n4_h * MI_SIZE;
   }
 
-  set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
-  set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
+  set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+  set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
 }
 
 static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
@@ -1338,4 +1339,4 @@ static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_ONYXC_INT_H_
+#endif  // AOM_AV1_COMMON_ONYXC_INT_H_
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
index 58933a7b3..026a07809 100644
--- a/third_party/aom/av1/common/ppc/cfl_ppc.c
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -24,19 +24,21 @@
 #define CFL_LINE_2 128
 #define CFL_LINE_3 192
 
-typedef vector int8_t int8x16_t;
-typedef vector uint8_t uint8x16_t;
-typedef vector int16_t int16x8_t;
-typedef vector uint16_t uint16x8_t;
-typedef vector int32_t int32x4_t;
-typedef vector uint32_t uint32x4_t;
-typedef vector uint64_t uint64x2_t;
+typedef vector signed char int8x16_t;          // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t;       // NOLINT(runtime/int)
+typedef vector signed short int16x8_t;         // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t;      // NOLINT(runtime/int)
+typedef vector signed int int32x4_t;           // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t;        // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t;  // NOLINT(runtime/int)
 
-static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
-                                        int height, int round_offset,
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+                                        int width, int height, int round_offset,
                                         int num_pel_log2) {
-  const int16_t *end = pred_buf + height * CFL_BUF_LINE;
-  const int16_t *sum_buf = pred_buf;
+  // NOTE(review): the loop below loads from dst, not src_ptr; confirm.
+  const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = (const int16_t *)src_ptr;
+  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
   const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
   const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
@@ -71,43 +73,40 @@ static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
   const int32x4_t avg = vec_sr(sum_32x4, div_shift);
   const int16x8_t vec_avg = vec_pack(avg, avg);
   do {
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf);
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg),
-               OFF_0 + CFL_BUF_LINE_BYTES, pred_buf);
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg),
-               OFF_0 + CFL_LINE_2, pred_buf);
-    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg),
-               OFF_0 + CFL_LINE_3, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg),
+               OFF_0 + CFL_BUF_LINE_BYTES, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg),
+               OFF_0 + CFL_LINE_2, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg),
+               OFF_0 + CFL_LINE_3, dst);
     if (width >= 16) {
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1,
-                 pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg),
-                 OFF_1 + CFL_LINE_1, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg),
-                 OFF_1 + CFL_LINE_2, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg),
-                 OFF_1 + CFL_LINE_3, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg),
+                 OFF_1 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg),
+                 OFF_1 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg),
+                 OFF_1 + CFL_LINE_3, dst);
     }
     if (width == 32) {
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2,
-                 pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg),
-                 OFF_2 + CFL_LINE_1, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg),
-                 OFF_2 + CFL_LINE_2, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg),
-                 OFF_2 + CFL_LINE_3, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg),
+                 OFF_2 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg),
+                 OFF_2 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg),
+                 OFF_2 + CFL_LINE_3, dst);
 
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3,
-                 pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg),
-                 OFF_3 + CFL_LINE_1, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg),
-                 OFF_3 + CFL_LINE_2, pred_buf);
-      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg),
-                 OFF_3 + CFL_LINE_3, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg),
+                 OFF_3 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg),
+                 OFF_3 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg),
+                 OFF_3 + CFL_LINE_3, dst);
     }
-  } while ((pred_buf += CFL_BUF_LINE * 4) < end);
+  } while ((dst += CFL_BUF_LINE * 4) < dst_end);
 }
 
 // Declare wrappers for VSX sizes
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
index d77739d85..5952441d1 100644
--- a/third_party/aom/av1/common/pred_common.c
+++ b/third_party/aom/av1/common/pred_common.c
@@ -31,8 +31,8 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int ctx_offset =
       (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
-  MV_REFERENCE_FRAME ref_frame =
-      (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1];
+  assert(dir == 0 || dir == 1);
+  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
index 6a835c467..6dba2322d 100644
--- a/third_party/aom/av1/common/pred_common.h
+++ b/third_party/aom/av1/common/pred_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_PRED_COMMON_H_
-#define AV1_COMMON_PRED_COMMON_H_
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
 
 #include "av1/common/blockd.h"
 #include "av1/common/mvref_common.h"
@@ -357,4 +357,4 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_PRED_COMMON_H_
+#endif  // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index ca199e94c..d1f52a660 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_QUANT_COMMON_H_
-#define AV1_COMMON_QUANT_COMMON_H_
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
 
 #include "aom/aom_codec.h"
 #include "av1/common/seg_common.h"
@@ -60,4 +60,4 @@ const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_QUANT_COMMON_H_
+#endif  // AOM_AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index b9f0b57f3..3203efce4 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -44,10 +44,9 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
 
   if (build_for_obmc) return 0;
 
-  if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) {
+  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
     if (final_warp_params != NULL)
-      memcpy(final_warp_params, &mbmi->wm_params[0],
-             sizeof(*final_warp_params));
+      memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
     return 1;
   } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
     if (final_warp_params != NULL)
@@ -78,6 +77,9 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
        av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
                       build_for_obmc, subpel_params->xs, subpel_params->ys,
                       &final_warp_params));
+  const int is_intrabc = mi->use_intrabc;
+  assert(IMPLIES(is_intrabc, !do_warp));
+
   if (do_warp && xd->cur_frame_force_integer_mv == 0) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const struct buf_2d *const pre_buf = &pd->pre[ref];
@@ -88,10 +90,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                    pd->subsampling_x, pd->subsampling_y, conv_params);
   } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
-                           w, h, conv_params, interp_filters, xd->bd);
+                           w, h, conv_params, interp_filters, is_intrabc,
+                           xd->bd);
   } else {
     inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
-                    conv_params, interp_filters);
+                    conv_params, interp_filters, is_intrabc);
   }
 }
 
@@ -574,37 +577,6 @@ static void build_masked_compound_no_round(
                                  h, subw, subh, conv_params);
 }
 
-static void build_masked_compound(
-    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
-    const uint8_t *src1, int src1_stride,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
-    int w) {
-  // Derive subsampling from h and w passed in. May be refactored to
-  // pass in subsampling factors directly.
-  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
-  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
-  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                     mask, block_size_wide[sb_type], w, h, subw, subh);
-}
-
-static void build_masked_compound_highbd(
-    uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
-    const uint8_t *src1_8, int src1_stride,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
-    int w, int bd) {
-  // Derive subsampling from h and w passed in. May be refactored to
-  // pass in subsampling factors directly.
-  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
-  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
-  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  // const uint8_t *mask =
-  //     av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
-  aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                            src1_stride, mask, block_size_wide[sb_type], w, h,
-                            subw, subh, bd);
-}
-
 void av1_make_masked_inter_predictor(
     const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
     const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
@@ -653,63 +625,6 @@ void av1_make_masked_inter_predictor(
                                  mi->sb_type, h, w, conv_params, xd);
 }
 
-// TODO(sarahparker) av1_highbd_build_inter_predictor and
-// av1_build_inter_predictor should be combined with
-// av1_make_inter_predictor
-void av1_highbd_build_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
-    InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
-    int p_row, int plane, enum mv_precision precision, int x, int y,
-    const MACROBLOCKD *xd, int can_use_previous) {
-  const int is_q4 = precision == MV_PRECISION_Q4;
-  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
-                     is_q4 ? src_mv->col : src_mv->col * 2 };
-  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
-  mv.col += SCALE_EXTRA_OFF;
-  mv.row += SCALE_EXTRA_OFF;
-  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                       mv.col & SCALE_SUBPEL_MASK,
-                                       mv.row & SCALE_SUBPEL_MASK };
-  ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
-
-  src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
-         (mv.col >> SCALE_SUBPEL_BITS);
-
-  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
-                           w, h, &conv_params, interp_filters, warp_types,
-                           p_col, p_row, plane, ref, xd->mi[0], 0, xd,
-                           can_use_previous);
-}
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *src_mv,
-                               const struct scale_factors *sf, int w, int h,
-                               ConvolveParams *conv_params,
-                               InterpFilters interp_filters,
-                               const WarpTypesAllowed *warp_types, int p_col,
-                               int p_row, int plane, int ref,
-                               enum mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd, int can_use_previous) {
-  const int is_q4 = precision == MV_PRECISION_Q4;
-  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
-                     is_q4 ? src_mv->col : src_mv->col * 2 };
-  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
-  mv.col += SCALE_EXTRA_OFF;
-  mv.row += SCALE_EXTRA_OFF;
-
-  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                       mv.col & SCALE_SUBPEL_MASK,
-                                       mv.row & SCALE_SUBPEL_MASK };
-  src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
-         (mv.col >> SCALE_SUBPEL_BITS);
-
-  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
-                           w, h, conv_params, interp_filters, warp_types, p_col,
-                           p_row, plane, ref, xd->mi[0], 0, xd,
-                           can_use_previous);
-}
-
 void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
                                 int order_idx, int *fwd_offset, int *bck_offset,
                                 int *use_jnt_comp_avg, int is_compound) {
@@ -759,279 +674,6 @@ void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
   *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
 }
 
-static INLINE void calc_subpel_params(
-    MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
-    int plane, const int pre_x, const int pre_y, int x, int y,
-    struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
-    int bw, int bh) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int is_scaled = av1_is_scaled(sf);
-  if (is_scaled) {
-    int ssx = pd->subsampling_x;
-    int ssy = pd->subsampling_y;
-    int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
-    orig_pos_y += mv.row * (1 << (1 - ssy));
-    int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
-    orig_pos_x += mv.col * (1 << (1 - ssx));
-    int pos_y = sf->scale_value_y(orig_pos_y, sf);
-    int pos_x = sf->scale_value_x(orig_pos_x, sf);
-    pos_x += SCALE_EXTRA_OFF;
-    pos_y += SCALE_EXTRA_OFF;
-
-    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                       << SCALE_SUBPEL_BITS;
-    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
-    pos_y = clamp(pos_y, top, bottom);
-    pos_x = clamp(pos_x, left, right);
-
-    *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-           (pos_x >> SCALE_SUBPEL_BITS);
-    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
-    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
-    subpel_params->xs = sf->x_step_q4;
-    subpel_params->ys = sf->y_step_q4;
-  } else {
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(
-        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
-    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
-           (x + (mv_q4.col >> SUBPEL_BITS));
-  }
-}
-
-static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          int plane, const MB_MODE_INFO *mi,
-                                          int build_for_obmc, int bw, int bh,
-                                          int mi_x, int mi_y) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int is_compound = has_second_ref(mi);
-  int ref;
-  const int is_intrabc = is_intrabc_block(mi);
-  assert(IMPLIES(is_intrabc, !is_compound));
-  int is_global[2] = { 0, 0 };
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
-  }
-
-  const BLOCK_SIZE bsize = mi->sb_type;
-  const int ss_x = pd->subsampling_x;
-  const int ss_y = pd->subsampling_y;
-  int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
-                     (block_size_high[bsize] < 8 && ss_y);
-
-  if (is_intrabc) sub8x8_inter = 0;
-
-  // For sub8x8 chroma blocks, we may be covering more than one luma block's
-  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
-  // the top-left corner of the prediction source - the correct top-left corner
-  // is at (pre_x, pre_y).
-  const int row_start =
-      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
-  const int col_start =
-      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
-  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
-  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
-  sub8x8_inter = sub8x8_inter && !build_for_obmc;
-  if (sub8x8_inter) {
-    for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
-      for (int col = col_start; col <= 0; ++col) {
-        const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
-        if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
-      }
-    }
-  }
-
-  if (sub8x8_inter) {
-    // block size
-    const int b4_w = block_size_wide[bsize] >> ss_x;
-    const int b4_h = block_size_high[bsize] >> ss_y;
-    const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
-    const int b8_w = block_size_wide[plane_bsize] >> ss_x;
-    const int b8_h = block_size_high[plane_bsize] >> ss_y;
-    assert(!is_compound);
-
-    const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
-
-    int row = row_start;
-    for (int y = 0; y < b8_h; y += b4_h) {
-      int col = col_start;
-      for (int x = 0; x < b8_w; x += b4_w) {
-        MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-        is_compound = has_second_ref(this_mbmi);
-        DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
-        int tmp_dst_stride = 8;
-        assert(bw < 8 || bh < 8);
-        ConvolveParams conv_params = get_conv_params_no_round(
-            0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
-        conv_params.use_jnt_comp_avg = 0;
-        struct buf_2d *const dst_buf = &pd->dst;
-        uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
-
-        ref = 0;
-        const RefBuffer *ref_buf =
-            &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
-
-        pd->pre[ref].buf0 =
-            (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
-        pd->pre[ref].buf =
-            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
-                                                     ref_buf->buf->uv_stride,
-                                                     &ref_buf->sf);
-        pd->pre[ref].width = ref_buf->buf->uv_crop_width;
-        pd->pre[ref].height = ref_buf->buf->uv_crop_height;
-        pd->pre[ref].stride = ref_buf->buf->uv_stride;
-
-        const struct scale_factors *const sf =
-            is_intrabc ? &cm->sf_identity : &ref_buf->sf;
-        struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-
-        const MV mv = this_mbmi->mv[ref].as_mv;
-
-        uint8_t *pre;
-        SubpelParams subpel_params;
-        WarpTypesAllowed warp_types;
-        warp_types.global_warp_allowed = is_global[ref];
-        warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
-
-        calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
-                           &subpel_params, bw, bh);
-
-        conv_params.ref = ref;
-        conv_params.do_average = ref;
-        if (is_masked_compound_type(mi->interinter_comp.type)) {
-          // masked compound type has its own average mechanism
-          conv_params.do_average = 0;
-        }
-
-        av1_make_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
-            b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
-            (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
-            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
-
-        ++col;
-      }
-      ++row;
-    }
-
-    for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
-    return;
-  }
-
-  {
-    DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
-    ConvolveParams conv_params = get_conv_params_no_round(
-        0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
-    av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
-                               &conv_params.bck_offset,
-                               &conv_params.use_jnt_comp_avg, is_compound);
-
-    struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf;
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf =
-          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-      const MV mv = mi->mv[ref].as_mv;
-
-      uint8_t *pre;
-      SubpelParams subpel_params;
-      calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
-                         &subpel_params, bw, bh);
-
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global[ref];
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-      conv_params.ref = ref;
-
-      if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
-        // masked compound type has its own average mechanism
-        conv_params.do_average = 0;
-        av1_make_masked_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
-            bh, &conv_params, mi->interp_filters, plane, &warp_types,
-            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
-            cm->allow_warped_motion);
-      } else {
-        conv_params.do_average = ref;
-        av1_make_inter_predictor(
-            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
-            bh, &conv_params, mi->interp_filters, &warp_types,
-            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
-            mi, build_for_obmc, xd, cm->allow_warped_motion);
-      }
-    }
-  }
-}
-
-static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
-                                              MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int mi_row, int mi_col,
-                                              int plane_from, int plane_to) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const struct macroblockd_plane *pd = &xd->plane[plane];
-    const int bw = pd->width;
-    const int bh = pd->height;
-
-    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                             pd->subsampling_y))
-      continue;
-
-    build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
-  }
-}
-
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize) {
-  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
-
-  if (is_interintra_pred(xd->mi[0])) {
-    BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
-                               { xd->plane[0].dst.stride, 0, 0 } };
-    if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
-                                        xd->plane[0].dst.stride, ctx, 0, bsize);
-  }
-}
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize) {
-  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
-                                    MAX_MB_PLANE - 1);
-
-  if (is_interintra_pred(xd->mi[0])) {
-    BUFFER_SET default_ctx = {
-      { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
-      { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
-    };
-    if (!ctx) ctx = &default_ctx;
-    av1_build_interintra_predictors_sbuv(
-        cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-        xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
-  }
-}
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize) {
-  const int num_planes = av1_num_planes(cm);
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-  if (num_planes > 1)
-    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
-}
-
 void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const int plane_start, const int plane_end) {
@@ -1292,63 +934,7 @@ void av1_setup_build_prediction_by_above_pred(
 
   xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
   xd->mb_to_right_edge = ctxt->mb_to_far_edge +
-                         (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_above_pred(
-    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
-    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
-  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int above_mi_col = ctxt->mi_col + rel_mi_col;
-  int mi_x, mi_y;
-  MB_MODE_INFO backup_mbmi = *above_mbmi;
-
-  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
-                                           above_mbmi, ctxt, num_planes);
-  mi_x = above_mi_col << MI_SIZE_LOG2;
-  mi_y = ctxt->mi_row << MI_SIZE_LOG2;
-
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
-  for (int j = 0; j < num_planes; ++j) {
-    const struct macroblockd_plane *pd = &xd->plane[j];
-    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
-    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
-                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
-
-    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
-  }
-  *above_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
-                                         uint8_t *tmp_buf[MAX_MB_PLANE],
-                                         int tmp_width[MAX_MB_PLANE],
-                                         int tmp_height[MAX_MB_PLANE],
-                                         int tmp_stride[MAX_MB_PLANE]) {
-  if (!xd->up_available) return;
-
-  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
-  // prediction block. This is half the height of the original block,
-  // except for 128-wide blocks, where we only use a height of 32.
-  int this_height = xd->n8_h * MI_SIZE;
-  int pred_height = AOMMIN(this_height / 2, 32);
-  xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
-
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_right_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_above(cm, xd, mi_col,
-                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
-                                build_prediction_by_above_pred, &ctxt);
-
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
-  xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+                         (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
 }
 
 void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
@@ -1386,101 +972,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
   xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
   xd->mb_to_bottom_edge =
       ctxt->mb_to_far_edge +
-      (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_left_pred(
-    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
-    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
-  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  const int left_mi_row = ctxt->mi_row + rel_mi_row;
-  int mi_x, mi_y;
-  MB_MODE_INFO backup_mbmi = *left_mbmi;
-
-  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
-                                          left_mbmi, ctxt, num_planes);
-  mi_x = ctxt->mi_col << MI_SIZE_LOG2;
-  mi_y = left_mi_row << MI_SIZE_LOG2;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
-  for (int j = 0; j < num_planes; ++j) {
-    const struct macroblockd_plane *pd = &xd->plane[j];
-    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
-                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
-    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
-  }
-  *left_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col,
-                                        uint8_t *tmp_buf[MAX_MB_PLANE],
-                                        int tmp_width[MAX_MB_PLANE],
-                                        int tmp_height[MAX_MB_PLANE],
-                                        int tmp_stride[MAX_MB_PLANE]) {
-  if (!xd->left_available) return;
-
-  // Adjust mb_to_right_edge to have the correct value for the OBMC
-  // prediction block. This is half the width of the original block,
-  // except for 128-wide blocks, where we only use a width of 32.
-  int this_width = xd->n8_w * MI_SIZE;
-  int pred_width = AOMMIN(this_width / 2, 32);
-  xd->mb_to_right_edge += (this_width - pred_width) * 8;
-
-  struct build_prediction_ctxt ctxt = { cm,         mi_row,
-                                        mi_col,     tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_bottom_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  foreach_overlappable_nb_left(cm, xd, mi_row,
-                               max_neighbor_obmc[mi_size_high_log2[bsize]],
-                               build_prediction_by_left_pred, &ctxt);
-
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_right_edge -= (this_width - pred_width) * 8;
-  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
-}
-
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col) {
-  const int num_planes = av1_num_planes(cm);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
-  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
-    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
-    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
-    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
-    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
-  } else {
-    dst_buf1[0] = tmp_buf1;
-    dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
-    dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = tmp_buf2;
-    dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
-    dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
-  }
-  av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
-                                      dst_width1, dst_height1, dst_stride1);
-  av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
-                                     dst_width2, dst_height2, dst_stride2);
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
-                       mi_row, mi_col, 0, num_planes);
-  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
-                                  dst_buf2, dst_stride2);
+      (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
 }
 
 /* clang-format off */
@@ -1668,127 +1160,3 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
   av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
   av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
 }
-
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     uint8_t *ypred, uint8_t *upred,
-                                     uint8_t *vpred, int ystride, int ustride,
-                                     int vstride, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize) {
-  av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize);
-  av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
-                                       ctx, bsize);
-}
-
-// Builds the inter-predictor for the single ref case
-// for use in the encoder to search the wedges efficiently.
-static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
-                                              int bw, int bh, int x, int y,
-                                              int w, int h, int mi_x, int mi_y,
-                                              int ref, uint8_t *const ext_dst,
-                                              int ext_dst_stride,
-                                              int can_use_previous) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MB_MODE_INFO *mi = xd->mi[0];
-
-  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-  struct buf_2d *const pre_buf = &pd->pre[ref];
-  uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
-  const MV mv = mi->mv[ref].as_mv;
-
-  ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
-  WarpTypesAllowed warp_types;
-  const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
-  warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
-  warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-  const int pre_x = (mi_x) >> pd->subsampling_x;
-  const int pre_y = (mi_y) >> pd->subsampling_y;
-  uint8_t *pre;
-  SubpelParams subpel_params;
-  calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
-                     &subpel_params, bw, bh);
-
-  av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
-                           &subpel_params, sf, w, h, &conv_params,
-                           mi->interp_filters, &warp_types, pre_x + x,
-                           pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
-}
-
-void av1_build_inter_predictors_for_planes_single_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
-    int can_use_previous) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(
-        bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
-    build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
-                                      mi_y, ref, ext_dst[plane],
-                                      ext_dst_stride[plane], can_use_previous);
-  }
-}
-
-static void build_wedge_inter_predictor_from_buf(
-    MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
-    int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int is_compound = has_second_ref(mbmi);
-  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
-  struct buf_2d *const dst_buf = &pd->dst;
-  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-  mbmi->interinter_comp.seg_mask = xd->seg_mask;
-  const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
-
-  if (is_compound && is_masked_compound_type(comp_data->type)) {
-    if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        av1_build_compound_diffwtd_mask_highbd(
-            comp_data->seg_mask, comp_data->mask_type,
-            CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
-      else
-        av1_build_compound_diffwtd_mask(
-            comp_data->seg_mask, comp_data->mask_type, ext_dst0,
-            ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
-    }
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      build_masked_compound_highbd(
-          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
-          mbmi->sb_type, h, w, xd->bd);
-    else
-      build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
-                            h, w);
-  } else {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
-                               xd->bd);
-    else
-      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
-                        0, NULL, 0, w, h);
-  }
-}
-
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int plane_from, int plane_to,
-                                              uint8_t *ext_dst0[3],
-                                              int ext_dst_stride0[3],
-                                              uint8_t *ext_dst1[3],
-                                              int ext_dst_stride1[3]) {
-  int plane;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(
-        bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
-    const int bw = block_size_wide[plane_bsize];
-    const int bh = block_size_high[plane_bsize];
-    build_wedge_inter_predictor_from_buf(
-        xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
-        ext_dst1[plane], ext_dst_stride1[plane]);
-  }
-}
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index 6a3def270..db86c777e 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_RECONINTER_H_
-#define AV1_COMMON_RECONINTER_H_
+#ifndef AOM_AV1_COMMON_RECONINTER_H_
+#define AOM_AV1_COMMON_RECONINTER_H_
 
 #include "av1/common/filter.h"
 #include "av1/common/onyxc_int.h"
@@ -113,40 +113,48 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
                                    const SubpelParams *subpel_params,
                                    const struct scale_factors *sf, int w, int h,
                                    ConvolveParams *conv_params,
-                                   InterpFilters interp_filters) {
+                                   InterpFilters interp_filters,
+                                   int is_intrabc) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
   assert(sf);
-  if (has_scale(subpel_params->xs, subpel_params->ys)) {
+  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+  assert(IMPLIES(is_intrabc, !is_scaled));
+  if (is_scaled) {
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, subpel_params->subpel_x,
                            subpel_params->xs, subpel_params->subpel_y,
-                           subpel_params->ys, 1, conv_params, sf);
+                           subpel_params->ys, 1, conv_params, sf, is_intrabc);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
-                           sp.ys, 0, conv_params, sf);
+                           sp.ys, 0, conv_params, sf, is_intrabc);
   }
 }
 
-static INLINE void highbd_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) {
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+                                          uint8_t *dst, int dst_stride,
+                                          const SubpelParams *subpel_params,
+                                          const struct scale_factors *sf, int w,
+                                          int h, ConvolveParams *conv_params,
+                                          InterpFilters interp_filters,
+                                          int is_intrabc, int bd) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
   assert(sf);
-  if (has_scale(subpel_params->xs, subpel_params->ys)) {
-    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                                  interp_filters, subpel_params->subpel_x,
-                                  subpel_params->xs, subpel_params->subpel_y,
-                                  subpel_params->ys, 1, conv_params, sf, bd);
+  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+  assert(IMPLIES(is_intrabc, !is_scaled));
+  if (is_scaled) {
+    av1_highbd_convolve_2d_facade(
+        src, src_stride, dst, dst_stride, w, h, interp_filters,
+        subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
+        subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
-    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                                  interp_filters, sp.subpel_x, sp.xs,
-                                  sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
+    av1_highbd_convolve_2d_facade(
+        src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
+        sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
   }
 }
 
@@ -237,35 +245,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
   return clamped_mv;
 }
 
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize);
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *src_mv,
-                               const struct scale_factors *sf, int w, int h,
-                               ConvolveParams *conv_params,
-                               InterpFilters interp_filters,
-                               const WarpTypesAllowed *warp_types, int p_col,
-                               int p_row, int plane, int ref,
-                               enum mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd, int can_use_previous);
-
-void av1_highbd_build_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
-    InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
-    int p_row, int plane, enum mv_precision precision, int x, int y,
-    const MACROBLOCKD *xd, int can_use_previous);
-
 static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
                                        const struct scale_factors *sf) {
   const int x =
@@ -303,32 +282,6 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const struct scale_factors *sf, const int num_planes);
 
-// Detect if the block have sub-pixel level motion vectors
-// per component.
-#define CHECK_SUBPEL 0
-static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
-                                          const MACROBLOCKD *const xd,
-                                          int dir) {
-#if CHECK_SUBPEL
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int plane;
-  int ref = (dir >> 1);
-
-  if (dir & 0x01) {
-    if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
-  } else {
-    if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
-  }
-
-  return 0;
-#else
-  (void)mbmi;
-  (void)xd;
-  (void)dir;
-  return 1;
-#endif
-}
-
 static INLINE void set_default_interp_filters(
     MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
   mbmi->interp_filters =
@@ -343,21 +296,6 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
   return 1;
 }
 
-static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
-  MB_MODE_INFO *const mi = xd->mi[0];
-  const int is_compound = has_second_ref(mi);
-  int ref;
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    int row_col;
-    for (row_col = 0; row_col < 2; ++row_col) {
-      const int dir = (ref << 1) + row_col;
-      if (has_subpel_mv_component(mi, xd, dir)) {
-        return 1;
-      }
-    }
-  }
-  return 0;
-}
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
@@ -367,18 +305,6 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
                                              MB_MODE_INFO *left_mbmi,
                                              struct build_prediction_ctxt *ctxt,
                                              const int num_planes);
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
-                                         uint8_t *tmp_buf[MAX_MB_PLANE],
-                                         int tmp_width[MAX_MB_PLANE],
-                                         int tmp_height[MAX_MB_PLANE],
-                                         int tmp_stride[MAX_MB_PLANE]);
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col,
-                                        uint8_t *tmp_buf[MAX_MB_PLANE],
-                                        int tmp_width[MAX_MB_PLANE],
-                                        int tmp_height[MAX_MB_PLANE],
-                                        int tmp_stride[MAX_MB_PLANE]);
 void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      int mi_row, int mi_col,
                                      uint8_t *above[MAX_MB_PLANE],
@@ -389,8 +315,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
 const uint8_t *av1_get_obmc_mask(int length);
 void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                       int mi_row, int mi_col);
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        int mi_row, int mi_col);
 
 #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
 #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
@@ -406,12 +330,6 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
 
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     uint8_t *ypred, uint8_t *upred,
-                                     uint8_t *vpred, int ystride, int ustride,
-                                     int vstride, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize);
-
 // build interintra_predictors for one plane
 void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *pred, int stride,
@@ -431,18 +349,6 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride);
 
-// Encoder only
-void av1_build_inter_predictors_for_planes_single_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
-    int can_use_previous);
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int plane_from, int plane_to,
-                                              uint8_t *ext_dst0[3],
-                                              int ext_dst_stride0[3],
-                                              uint8_t *ext_dst1[3],
-                                              int ext_dst_stride1[3]);
-
 void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
                                 int order_idx, int *fwd_offset, int *bck_offset,
                                 int *use_jnt_comp_avg, int is_compound);
@@ -456,4 +362,4 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_RECONINTER_H_
+#endif  // AOM_AV1_COMMON_RECONINTER_H_
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index 57638f24e..07853aba0 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_RECONINTRA_H_
-#define AV1_COMMON_RECONINTRA_H_
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
 
 #include <stdlib.h>
 
@@ -116,4 +116,4 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // AV1_COMMON_RECONINTRA_H_
+#endif  // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 93d62292a..d61a20aa2 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -170,42 +170,6 @@ static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
   { -1, 3, -9, 17, 112, 10, -7, 3 },  { -1, 3, -8, 15, 112, 12, -7, 2 },
 };
 
-// Filters for interpolation (full-band) - no filtering for integer pixels
-static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },        { 0, 0, -1, 128, 2, -1, 0, 0 },
-  { 0, 1, -3, 127, 4, -2, 1, 0 },      { 0, 1, -4, 127, 6, -3, 1, 0 },
-  { 0, 2, -6, 126, 8, -3, 1, 0 },      { 0, 2, -7, 125, 11, -4, 1, 0 },
-  { -1, 2, -8, 125, 13, -5, 2, 0 },    { -1, 3, -9, 124, 15, -6, 2, 0 },
-  { -1, 3, -10, 123, 18, -6, 2, -1 },  { -1, 3, -11, 122, 20, -7, 3, -1 },
-  { -1, 4, -12, 121, 22, -8, 3, -1 },  { -1, 4, -13, 120, 25, -9, 3, -1 },
-  { -1, 4, -14, 118, 28, -9, 3, -1 },  { -1, 4, -15, 117, 30, -10, 4, -1 },
-  { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
-  { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
-  { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
-  { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
-  { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
-  { -1, 6, -20, 97, 58, -17, 6, -1 },  { -1, 6, -20, 95, 61, -18, 6, -1 },
-  { -2, 7, -20, 93, 64, -18, 6, -2 },  { -2, 7, -20, 91, 66, -19, 6, -1 },
-  { -2, 7, -20, 88, 69, -19, 6, -1 },  { -2, 7, -20, 86, 71, -19, 6, -1 },
-  { -2, 7, -20, 84, 74, -20, 7, -2 },  { -2, 7, -20, 81, 76, -20, 7, -1 },
-  { -2, 7, -20, 79, 79, -20, 7, -2 },  { -1, 7, -20, 76, 81, -20, 7, -2 },
-  { -2, 7, -20, 74, 84, -20, 7, -2 },  { -1, 6, -19, 71, 86, -20, 7, -2 },
-  { -1, 6, -19, 69, 88, -20, 7, -2 },  { -1, 6, -19, 66, 91, -20, 7, -2 },
-  { -2, 6, -18, 64, 93, -20, 7, -2 },  { -1, 6, -18, 61, 95, -20, 6, -1 },
-  { -1, 6, -17, 58, 97, -20, 6, -1 },  { -1, 6, -17, 56, 99, -20, 6, -1 },
-  { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
-  { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
-  { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
-  { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
-  { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
-  { -1, 3, -9, 28, 118, -14, 4, -1 },  { -1, 3, -9, 25, 120, -13, 4, -1 },
-  { -1, 3, -8, 22, 121, -12, 4, -1 },  { -1, 3, -7, 20, 122, -11, 3, -1 },
-  { -1, 2, -6, 18, 123, -10, 3, -1 },  { 0, 2, -6, 15, 124, -9, 3, -1 },
-  { 0, 2, -5, 13, 125, -8, 2, -1 },    { 0, 1, -4, 11, 125, -7, 2, 0 },
-  { 0, 1, -3, 8, 126, -6, 2, 0 },      { 0, 1, -3, 6, 127, -4, 1, 0 },
-  { 0, 1, -2, 4, 127, -3, 1, 0 },      { 0, 0, -1, 2, 128, -1, 0, 0 },
-};
-
 const int16_t av1_resize_filter_normative[(
     1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
 #if UPSCALE_NORMATIVE_TAPS == 8
@@ -246,6 +210,9 @@ const int16_t av1_resize_filter_normative[(
 #endif  // UPSCALE_NORMATIVE_TAPS == 8
 };
 
+// Filters for interpolation (full-band) - no filtering for integer pixels
+#define filteredinterp_filters1000 av1_resize_filter_normative
+
 // Filters for factor of 2 downsampling.
 static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
 static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
index feec3a90e..9a59a8d63 100644
--- a/third_party/aom/av1/common/resize.h
+++ b/third_party/aom/av1/common/resize.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_RESIZE_H_
-#define AV1_ENCODER_RESIZE_H_
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
 
 #include <stdio.h>
 #include "aom/aom_integer.h"
@@ -109,4 +109,4 @@ int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
 }  // extern "C"
 #endif
 
-#endif  // AV1_ENCODER_RESIZE_H_
+#endif  // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 632967957..d276a915b 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -661,9 +661,10 @@ const int32_t one_by_x[MAX_NELEM] = {
   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
 };
 
-static void selfguided_restoration_fast_internal(
-    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
-    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+static void calculate_intermediate_result(int32_t *dgd, int width, int height,
+                                          int dgd_stride, int bit_depth,
+                                          int sgr_params_idx, int radius_idx,
+                                          int pass, int32_t *A, int32_t *B) {
   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
   const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
@@ -673,10 +674,7 @@ static void selfguided_restoration_fast_internal(
   // We also align the stride to a multiple of 16 bytes, for consistency
   // with the SIMD version of this function.
   int buf_stride = ((width_ext + 3) & ~3) + 16;
-  int32_t A_[RESTORATION_PROC_UNIT_PELS];
-  int32_t B_[RESTORATION_PROC_UNIT_PELS];
-  int32_t *A = A_;
-  int32_t *B = B_;
+  const int step = pass == 0 ? 1 : 2;
   int i, j;
 
   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
@@ -691,7 +689,7 @@ static void selfguided_restoration_fast_internal(
   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
   // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
   // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
-  for (i = -1; i < height + 1; i += 2) {
+  for (i = -1; i < height + 1; i += step) {
     for (j = -1; j < width + 1; ++j) {
       const int k = i * buf_stride + j;
       const int n = (2 * r + 1) * (2 * r + 1);
@@ -754,7 +752,31 @@ static void selfguided_restoration_fast_internal(
                                          SGRPROJ_RECIP_BITS);
     }
   }
+}
+
+static void selfguided_restoration_fast_internal(
+    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 16 bytes, for consistency
+  // with the SIMD version of this function.
+  int buf_stride = ((width_ext + 3) & ~3) + 16;
+  int32_t A_[RESTORATION_PROC_UNIT_PELS];
+  int32_t B_[RESTORATION_PROC_UNIT_PELS];
+  int32_t *A = A_;
+  int32_t *B = B_;
+  int i, j;
+  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+                                sgr_params_idx, radius_idx, 1, A, B);
+  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
   // Use the A[] and B[] arrays to calculate the filtered image
+  (void)r;
   assert(r == 2);
   for (i = 0; i < height; ++i) {
     if (!(i & 1)) {  // even row
@@ -796,10 +818,7 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
                                             int dst_stride, int bit_depth,
                                             int sgr_params_idx,
                                             int radius_idx) {
-  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
-  const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
-  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
   // Adjusting the stride of A and B here appears to avoid bad cache effects,
   // leading to a significant speed improvement.
   // We also align the stride to a multiple of 16 bytes, for consistency
@@ -810,82 +829,11 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
   int32_t *A = A_;
   int32_t *B = B_;
   int i, j;
-
-  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
-  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
-         "Need SGRPROJ_BORDER_* >= r+1");
-
-  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
-         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
-  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
-         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+                                sgr_params_idx, radius_idx, 0, A, B);
   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
-  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
-  for (i = -1; i < height + 1; ++i) {
-    for (j = -1; j < width + 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int n = (2 * r + 1) * (2 * r + 1);
-
-      // a < 2^16 * n < 2^22 regardless of bit depth
-      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
-      // b < 2^8 * n < 2^14 regardless of bit depth
-      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
-
-      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
-      // and p itself satisfies p < 2^14 * n^2 < 2^26.
-      // This bound on p is due to:
-      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
-      //
-      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
-      // This is an artefact of rounding, and can only happen if all pixels
-      // are (almost) identical, so in this case we saturate to p=0.
-      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
-
-      const uint32_t s = params->s[radius_idx];
-
-      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
-      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
-      // (this holds even after accounting for the rounding in s)
-      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
-
-      // Note: We have to be quite careful about the value of A[k].
-      // This is used as a blend factor between individual pixel values and the
-      // local mean. So it logically has a range of [0, 256], including both
-      // endpoints.
-      //
-      // This is a pain for hardware, as we'd like something which can be stored
-      // in exactly 8 bits.
-      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
-      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
-      // slightly above 2^(8 + bit depth), due to rounding in the value of
-      // one_by_x[25-1].
-      //
-      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
-      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
-      // overflow), without significantly affecting the final result: z == 0
-      // implies that the image is essentially "flat", so the local mean and
-      // individual pixel values are very similar.
-      //
-      // Note that saturating on the other side, ie. requring A[k] <= 255,
-      // would be a bad idea, as that corresponds to the case where the image
-      // is very variable, when we want to preserve the local pixel value as
-      // much as possible.
-      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
 
-      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
-      // one_by_x[n - 1] = round(2^12 / n)
-      // => the product here is < 2^(20 + bit_depth) <= 2^32,
-      // and B[k] is set to a value < 2^(8 + bit depth)
-      // This holds even with the rounding in one_by_x and in the overall
-      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
-      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
-                                             (uint32_t)B[k] *
-                                             (uint32_t)one_by_x[n - 1],
-                                         SGRPROJ_RECIP_BITS);
-    }
-  }
   // Use the A[] and B[] arrays to calculate the filtered image
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
@@ -911,10 +859,10 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
   }
 }
 
-void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
-                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
-                                  int flt_stride, int sgr_params_idx,
-                                  int bit_depth, int highbd) {
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
+                                 int flt_stride, int sgr_params_idx,
+                                 int bit_depth, int highbd) {
   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
   int32_t *dgd32 =
@@ -948,6 +896,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
   if (params->r[1] > 0)
     selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
                                     flt_stride, bit_depth, sgr_params_idx, 1);
+  return 0;
 }
 
 void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
@@ -959,8 +908,10 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
 
-  av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width,
-                               eps, bit_depth, highbd);
+  const int ret = av1_selfguided_restoration_c(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  (void)ret;
+  assert(!ret);
   const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
   decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index aec37d834..d834f9270 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_RESTORATION_H_
-#define AV1_COMMON_RESTORATION_H_
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
 
 #include "aom_ports/mem.h"
 #include "config/aom_config.h"
@@ -120,6 +120,7 @@ extern "C" {
 // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
 // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
 #define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
 
 #define WIENER_FILT_PREC_BITS 7
 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
@@ -373,4 +374,4 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_RESTORATION_H_
+#endif  // AOM_AV1_COMMON_RESTORATION_H_
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
index 5f02fdb81..748e958c3 100644
--- a/third_party/aom/av1/common/scale.h
+++ b/third_party/aom/av1/common/scale.h
@@ -9,12 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_SCALE_H_
-#define AV1_COMMON_SCALE_H_
+#ifndef AOM_AV1_COMMON_SCALE_H_
+#define AOM_AV1_COMMON_SCALE_H_
 
 #include "av1/common/convolve.h"
 #include "av1/common/mv.h"
-#include "aom_dsp/aom_convolve.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -65,4 +64,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_SCALE_H_
+#endif  // AOM_AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index d206586b5..233dc0efa 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_SCAN_H_
-#define AV1_COMMON_SCAN_H_
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -52,4 +52,4 @@ static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_SCAN_H_
+#endif  // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
index c851d65fd..8c35bba86 100644
--- a/third_party/aom/av1/common/seg_common.h
+++ b/third_party/aom/av1/common/seg_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_SEG_COMMON_H_
-#define AV1_COMMON_SEG_COMMON_H_
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
 
 #include "aom_dsp/prob.h"
 
@@ -101,4 +101,4 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_SEG_COMMON_H_
+#endif  // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index f9b734b8c..8df4c9a09 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -304,8 +304,9 @@ static INLINE void thread_loop_filter_rows(
 }
 
 // Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
-                                  LFWorkerData *const lf_data) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+  AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
   thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                           lf_data->xd, lf_sync);
   return 1;
@@ -342,7 +343,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
-    worker->hook = (AVxWorkerHook)loop_filter_row_worker;
+    worker->hook = loop_filter_row_worker;
     worker->data1 = lf_sync;
     worker->data2 = lf_data;
 
@@ -649,8 +650,9 @@ AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
 }
 
 // Implement row loop restoration for each thread.
-static int loop_restoration_row_worker(AV1LrSync *const lr_sync,
-                                       LRWorkerData *lrworkerdata) {
+static int loop_restoration_row_worker(void *arg1, void *arg2) {
+  AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
+  LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
   AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
   FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
   int lr_unit_row;
@@ -714,10 +716,12 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
   int num_rows_lr = 0;
 
   for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
     const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
     const int max_tile_h = tile_rect.bottom - tile_rect.top;
 
-    const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+    const int unit_size = cm->rst_info[plane].restoration_unit_size;
 
     num_rows_lr =
         AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
@@ -746,7 +750,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
   for (i = 0; i < num_workers; ++i) {
     AVxWorker *const worker = &workers[i];
     lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
-    worker->hook = (AVxWorkerHook)loop_restoration_row_worker;
+    worker->hook = loop_restoration_row_worker;
     worker->data1 = lr_sync;
     worker->data2 = &lr_sync->lrworkerdata[i];
 
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
index 4b0d5d2b8..23d61d72a 100644
--- a/third_party/aom/av1/common/thread_common.h
+++ b/third_party/aom/av1/common/thread_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_LOOPFILTER_THREAD_H_
-#define AV1_COMMON_LOOPFILTER_THREAD_H_
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
 
 #include "config/aom_config.h"
 
@@ -116,4 +116,4 @@ void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_LOOPFILTER_THREAD_H_
+#endif  // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 026c904b6..1b413487f 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -127,6 +127,22 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
   assert(tile->mi_col_end > tile->mi_col_start);
 }
 
+// Superblock rows covered by `tile`; a partial last row counts as one.
+int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
+  const int sb_log2 = cm->seq_params.mib_size_log2;
+  const int tile_mi_rows = tile.mi_row_end - tile.mi_row_start;
+  // Round the tile height up to a superblock multiple, then convert to SBs.
+  return ALIGN_POWER_OF_TWO(tile_mi_rows, sb_log2) >> sb_log2;
+}
+
+// Superblock columns covered by `tile`; a partial last column counts as one.
+int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
+  const int sb_log2 = cm->seq_params.mib_size_log2;
+  const int tile_mi_cols = tile.mi_col_end - tile.mi_col_start;
+  // Round the tile width up to a superblock multiple, then convert to SBs.
+  return ALIGN_POWER_OF_TWO(tile_mi_cols, sb_log2) >> sb_log2;
+}
+
 int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
   // Round the frame up to a whole number of max superblocks
   mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
index be037fb17..c03553dc6 100644
--- a/third_party/aom/av1/common/tile_common.h
+++ b/third_party/aom/av1/common/tile_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_TILE_COMMON_H_
-#define AV1_COMMON_TILE_COMMON_H_
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -44,6 +44,9 @@ void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
 // tiles horizontally or vertically in the frame.
 int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
 
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
 typedef struct {
   int left, top, right, bottom;
 } AV1PixelRect;
@@ -66,4 +69,4 @@ void av1_calculate_tile_rows(struct AV1Common *const cm);
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_TILE_COMMON_H_
+#endif  // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
index 1749baa57..06939ae43 100644
--- a/third_party/aom/av1/common/timing.h
+++ b/third_party/aom/av1/common/timing.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_TIMING_H_
-#define AOM_TIMING_H_
+#ifndef AOM_AV1_COMMON_TIMING_H_
+#define AOM_AV1_COMMON_TIMING_H_
 
 #include "aom/aom_integer.h"
 #include "av1/common/enums.h"
@@ -56,4 +56,4 @@ void set_resource_availability_parameters(
 int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
                           int seq_tier);
 
-#endif  // AOM_TIMING_H_
+#endif  // AOM_AV1_COMMON_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
index 9a6b454ac..53e956450 100644
--- a/third_party/aom/av1/common/token_cdfs.h
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_
+#define AOM_AV1_COMMON_TOKEN_CDFS_H_
+
 #include "config/aom_config.h"
 
 #include "av1/common/entropy.h"
@@ -3548,3 +3551,5 @@ static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
                                         { AOM_CDF3(10923, 21845) },
                                         { AOM_CDF3(10923, 21845) },
                                         { AOM_CDF3(10923, 21845) } } } } };
+
+#endif  // AOM_AV1_COMMON_TOKEN_CDFS_H_
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index f0ab79d0f..1dda51f8b 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_TXB_COMMON_H_
-#define AV1_COMMON_TXB_COMMON_H_
+#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
+#define AOM_AV1_COMMON_TXB_COMMON_H_
 
 extern const int16_t k_eob_group_start[12];
 extern const int16_t k_eob_offset_bits[12];
@@ -34,24 +34,6 @@ static const int base_level_count_to_index[13] = {
   0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
 };
 
-// Note: TX_PAD_2D is dependent to this offset table.
-static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = {
-  /* clang-format off*/
-  { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 },
-  { 0, 2 },  { 1, -1 },  { 1, 0 },  { 1, 1 },  { 2, 0 }
-  /* clang-format on*/
-};
-
-#define CONTEXT_MAG_POSITION_NUM 3
-static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = {
-  { { 0, 1 }, { 1, 0 }, { 1, 1 } },
-  { { 0, 1 }, { 1, 0 }, { 0, 2 } },
-  { { 0, 1 }, { 1, 0 }, { 2, 0 } }
-};
-static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = {
-  { 0, 1 }, { 1, 0 }, { 1, 1 }
-};
-
 static const TX_CLASS tx_type_to_class[TX_TYPES] = {
   TX_CLASS_2D,     // DCT_DCT
   TX_CLASS_2D,     // ADST_DCT
@@ -71,61 +53,6 @@ static const TX_CLASS tx_type_to_class[TX_TYPES] = {
   TX_CLASS_HORIZ,  // H_FLIPADST
 };
 
-static const int8_t eob_to_pos_small[33] = {
-  0, 1, 2,                                        // 0-2
-  3, 3,                                           // 3-4
-  4, 4, 4, 4,                                     // 5-8
-  5, 5, 5, 5, 5, 5, 5, 5,                         // 9-16
-  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6  // 17-32
-};
-
-static const int8_t eob_to_pos_large[17] = {
-  6,                               // place holder
-  7,                               // 33-64
-  8,  8,                           // 65-128
-  9,  9,  9,  9,                   // 129-256
-  10, 10, 10, 10, 10, 10, 10, 10,  // 257-512
-  11                               // 513-
-};
-
-static INLINE int get_eob_pos_token(const int eob, int *const extra) {
-  int t;
-
-  if (eob < 33) {
-    t = eob_to_pos_small[eob];
-  } else {
-    const int e = AOMMIN((eob - 1) >> 5, 16);
-    t = eob_to_pos_large[e];
-  }
-
-  *extra = eob - k_eob_group_start[t];
-
-  return t;
-}
-
-static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type,
-                                      const int eob_token) {
-  static const int8_t tx_type_to_offset[TX_TYPES] = {
-    -1,  // DCT_DCT
-    -1,  // ADST_DCT
-    -1,  // DCT_ADST
-    -1,  // ADST_ADST
-    -1,  // FLIPADST_DCT
-    -1,  // DCT_FLIPADST
-    -1,  // FLIPADST_FLIPADST
-    -1,  // ADST_FLIPADST
-    -1,  // FLIPADST_ADST
-    -1,  // IDTX
-    10,  // V_DCT
-    10,  // H_DCT
-    10,  // V_ADST
-    10,  // H_ADST
-    10,  // V_FLIPADST
-    10,  // H_FLIPADST
-  };
-  return eob_token + tx_type_to_offset[tx_type];
-}
-
 static INLINE int get_txb_bwl(TX_SIZE tx_size) {
   tx_size = av1_get_adjusted_tx_size(tx_size);
   return tx_size_wide_log2[tx_size];
@@ -141,36 +68,6 @@ static INLINE int get_txb_high(TX_SIZE tx_size) {
   return tx_size_high[tx_size];
 }
 
-static INLINE void get_base_count_mag(int *mag, int *count,
-                                      const tran_low_t *tcoeffs, int bwl,
-                                      int height, int row, int col) {
-  mag[0] = 0;
-  mag[1] = 0;
-  for (int i = 0; i < NUM_BASE_LEVELS; ++i) count[i] = 0;
-  for (int idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
-    const int ref_row = row + base_ref_offset[idx][0];
-    const int ref_col = col + base_ref_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl))
-      continue;
-    const int pos = (ref_row << bwl) + ref_col;
-    tran_low_t abs_coeff = abs(tcoeffs[pos]);
-    // count
-    for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
-      count[i] += abs_coeff > i;
-    }
-    // mag
-    if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) {
-      if (abs_coeff > mag[0]) {
-        mag[0] = abs_coeff;
-        mag[1] = 1;
-      } else if (abs_coeff == mag[0]) {
-        ++mag[1];
-      }
-    }
-  }
-}
-
 static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) {
   return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR);
 }
@@ -179,30 +76,6 @@ static INLINE int get_padded_idx(const int idx, const int bwl) {
   return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2);
 }
 
-static INLINE int get_level_count(const uint8_t *const levels, const int stride,
-                                  const int row, const int col, const int level,
-                                  const int (*nb_offset)[2], const int nb_num) {
-  int count = 0;
-
-  for (int idx = 0; idx < nb_num; ++idx) {
-    const int ref_row = row + nb_offset[idx][0];
-    const int ref_col = col + nb_offset[idx][1];
-    const int pos = ref_row * stride + ref_col;
-    count += levels[pos] > level;
-  }
-  return count;
-}
-
-static INLINE void get_level_mag(const uint8_t *const levels, const int stride,
-                                 const int row, const int col, int *const mag) {
-  for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) {
-    const int ref_row = row + mag_ref_offset[idx][0];
-    const int ref_col = col + mag_ref_offset[idx][1];
-    const int pos = ref_row * stride + ref_col;
-    mag[idx] = levels[pos];
-  }
-}
-
 static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
                                               int sig_mag) {
   const int ctx = base_level_count_to_index[count];
@@ -267,84 +140,6 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
   return ctx_idx;
 }
 
-static INLINE int get_base_ctx(const uint8_t *const levels,
-                               const int c,  // raster order
-                               const int bwl, const int level_minus_1,
-                               const int count) {
-  const int row = c >> bwl;
-  const int col = c - (row << bwl);
-  const int stride = (1 << bwl) + TX_PAD_HOR;
-  int mag_count = 0;
-  int nb_mag[3] = { 0 };
-
-  get_level_mag(levels, stride, row, col, nb_mag);
-
-  for (int idx = 0; idx < 3; ++idx)
-    mag_count += nb_mag[idx] > (level_minus_1 + 1);
-  const int ctx_idx =
-      get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count));
-  return ctx_idx;
-}
-
-#define BR_CONTEXT_POSITION_NUM 8  // Base range coefficient context
-// Note: TX_PAD_2D is dependent to this offset table.
-static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = {
-  /* clang-format off*/
-  { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 },
-  { 0, 1 },   { 1, -1 }, { 1, 0 },  { 1, 1 },
-  /* clang-format on*/
-};
-
-static const int br_level_map[9] = {
-  0, 0, 1, 1, 2, 2, 3, 3, 3,
-};
-
-// Note: If BR_MAG_OFFSET changes, the calculation of offset in
-// get_br_ctx_from_count_mag() must be updated.
-#define BR_MAG_OFFSET 1
-// TODO(angiebird): optimize this function by using a table to map from
-// count/mag to ctx
-
-static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl,
-                                   int height, int row, int col, int level) {
-  mag[0] = 0;
-  mag[1] = 0;
-  int count = 0;
-  for (int idx = 0; idx < BR_CONTEXT_POSITION_NUM; ++idx) {
-    const int ref_row = row + br_ref_offset[idx][0];
-    const int ref_col = col + br_ref_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl))
-      continue;
-    const int pos = (ref_row << bwl) + ref_col;
-    tran_low_t abs_coeff = abs(tcoeffs[pos]);
-    count += abs_coeff > level;
-    if (br_ref_offset[idx][0] >= 0 && br_ref_offset[idx][1] >= 0) {
-      if (abs_coeff > mag[0]) {
-        mag[0] = abs_coeff;
-        mag[1] = 1;
-      } else if (abs_coeff == mag[0]) {
-        ++mag[1];
-      }
-    }
-  }
-  return count;
-}
-
-static INLINE int get_br_ctx_from_count_mag(const int row, const int col,
-                                            const int count, const int mag) {
-  // DC: 0 - 1
-  // Top row: 2 - 4
-  // Left column: 5 - 7
-  // others: 8 - 11
-  static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } };
-  const int mag_clamp = AOMMIN(mag, 6);
-  const int offset = mag_clamp >> 1;
-  const int ctx =
-      br_level_map[count] + offset * BR_TMP_OFFSET + offset_pos[!row][!col];
-  return ctx;
-}
-
 static INLINE int get_br_ctx_2d(const uint8_t *const levels,
                                 const int c,  // raster order
                                 const int bwl) {
@@ -396,38 +191,6 @@ static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
   return mag + 14;
 }
 
-#define SIG_REF_OFFSET_NUM 5
-
-// Note: TX_PAD_2D is dependent to these offset tables.
-static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = {
-  { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 }
-  // , { 1, 2 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = {
-  { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 }
-  // , { 1, 1 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = {
-  { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 }
-  // , { 1, 1 }, { 1, 2 },
-};
-
-#define SIG_REF_DIFF_OFFSET_NUM 3
-
-static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = {
-  { 1, 1 }, { 0, 2 }, { 2, 0 }
-};
-
-static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = {
-  { 2, 0 }, { 3, 0 }, { 4, 0 }
-};
-
-static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = {
-  { 0, 2 }, { 0, 3 }, { 0, 4 }
-};
-
 static const uint8_t clip_max3[256] = {
   0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -658,4 +421,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
 
 void av1_init_lv_map(AV1_COMMON *cm);
 
-#endif  // AV1_COMMON_TXB_COMMON_H_
+#endif  // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index 412d83ed8..4144c4389 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -562,7 +562,7 @@ static int64_t highbd_warp_error(
   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
   uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
 
-  ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+  ConvolveParams conv_params = get_conv_params(0, 0, bd);
   conv_params.use_jnt_comp_avg = 0;
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
@@ -845,7 +845,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
   int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
   uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
-  ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+  ConvolveParams conv_params = get_conv_params(0, 0, 8);
   conv_params.use_jnt_comp_avg = 0;
 
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index ce4032ee5..a1a4f067d 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_WARPED_MOTION_H_
-#define AV1_COMMON_WARPED_MOTION_H_
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -92,4 +92,4 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
                     int mi_col);
 
 int get_shear_params(WarpedMotionParams *wm);
-#endif  // AV1_COMMON_WARPED_MOTION_H_
+#endif  // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 0c5286f9d..d9fb53785 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
index ae331b40d..5db2ccf6c 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -18,6 +18,12 @@
 #include "av1/common/x86/av1_inv_txfm_avx2.h"
 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
 
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2^1 .. Sqrt2^5 in Q12 fixed point (x4096); 5793 ~= 4096 * sqrt(2)
+static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
+                                                4 * 4096, 4 * 5793 };
+
 static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
index 7b5b29cf8..f74cbaeaa 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
 
 #include <immintrin.h>
 
@@ -68,4 +68,4 @@ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
 }
 #endif
 
-#endif  // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
index dd7cee24c..995bc3da4 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -16,6 +16,12 @@
 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
 #include "av1/common/x86/av1_txfm_sse2.h"
 
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2^1 .. Sqrt2^5 in Q12 fixed point (x4096); 5793 ~= 4096 * sqrt(2)
+static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
+                                                4 * 4096, 4 * 5793 };
+
 // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
 
 static void idct4_new_sse2(const __m128i *input, __m128i *output,
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
index dc9be25d2..66bd339d1 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
 
 #include <emmintrin.h>  // SSE2
 #include <tmmintrin.h>  // SSSE3
@@ -94,10 +94,6 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
   IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
 };
 
-// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
-static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
-                                          4 * 5793 };
-
 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
 };
@@ -233,4 +229,4 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
 }  // extern "C"
 #endif
 
-#endif  // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
index 721cfe059..77aeb6eb1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse2.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -8,8 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
-#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
 
@@ -314,4 +314,4 @@ typedef struct {
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
-#endif  // AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
index 367e02096..6cad821b1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_TXFM_SSE4_H_
-#define AV1_TXFM_SSE4_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
 
 #include <smmintrin.h>
 
@@ -45,8 +45,9 @@ static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
 static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
                                                         __m128i *output,
                                                         const int size,
-                                                        const int bit) {
-  const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2);
+                                                        const int bit,
+                                                        const int val) {
+  const __m128i sqrt2 = _mm_set1_epi32(val);
   if (bit > 0) {
     int i;
     for (i = 0; i < size; i++) {
@@ -68,4 +69,4 @@ static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
 }
 #endif
 
-#endif  // AV1_TXFM_SSE4_H_
+#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
index 7479ac3e1..3b342cd4e 100644
--- a/third_party/aom/av1/common/x86/cfl_simd.h
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
 #include "av1/common/blockd.h"
 
 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
@@ -236,3 +239,5 @@ void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                              int dst_stride, int alpha_q3, int bd);
 void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                              int dst_stride, int alpha_q3, int bd);
+
+#endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
index 1099144fe..0acafd044 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -11,10 +11,8 @@
 
 #include <immintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index 637f83cf7..b1a62a4f6 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -11,9 +11,8 @@
 
 #include <emmintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
index f66dee37d..5016642de 100644
--- a/third_party/aom/av1/common/x86/convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -11,9 +11,8 @@
 
 #include <emmintrin.h>
 
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
@@ -76,8 +75,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
   return convolve(ss, coeffs);
 }
 
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
-                            const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
@@ -237,8 +236,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
   }
 }
 
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
-                            const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
index 8444ffa93..ae68f0bbb 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
index eb340523a..3f8dafb4b 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -15,7 +15,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index 33183fdee..1d029db39 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
index debb05a6d..ade2af03e 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -15,6 +15,9 @@
 #include "config/av1_rtcd.h"
 
 #include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
 
 // Note:
 //  Total 32x4 registers to represent 32x32 block coefficients.
@@ -27,131 +30,125 @@
 //   ... ...
 //   v124, v125, v126, v127
 
-static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+  __m256i clamped, mask;
+  // Clamp every 16-bit lane of u to [0, (1 << bd) - 1] using signed compares.
+  mask = _mm256_cmpgt_epi16(u, max);  // lanes with u > max
+  clamped = _mm256_andnot_si256(mask, u);  // keep u elsewhere, 0 in those lanes
+  mask = _mm256_and_si256(mask, max);  // max in the lanes with u > max
+  clamped = _mm256_or_si256(mask, clamped);  // merged: min(u, max)
+  mask = _mm256_cmpgt_epi16(clamped, zero);  // lanes that are > 0
+  clamped = _mm256_and_si256(clamped, mask);  // negative lanes become 0
+  // NOTE(review): lanes >= 0x8000 compare as negative and end up 0 - confirm.
+  return clamped;
+}
+
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
+                                                 __m256i res0, __m256i res1,
+                                                 const int bd) {
+  __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
+  __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
+  // x0/x1: low/high 8 lanes of pred sign-extended to 32 bit; add res0/res1.
+  x0 = _mm256_add_epi32(res0, x0);
+  x1 = _mm256_add_epi32(res1, x1);
+  x0 = _mm256_packus_epi32(x0, x1);  // saturate to u16, packed per 128-bit half
+  x0 = _mm256_permute4x64_epi64(x0, 0xd8);  // qwords 0,2,1,3: linear order again
+  x0 = highbd_clamp_epi16_avx2(x0, bd);
+  return x0;
+}
+
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
+                                                 int stride, int flipud,
+                                                 int height, const int bd) {
+  int j = flipud ? (height - 1) : 0;  // index into in[]; runs backwards if flipud
+  const int step = flipud ? -1 : 1;  // each pass updates one 16-element row
+  for (int i = 0; i < height; ++i, j += step) {
+    __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
+    __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
+    // in[j] is added to elements 0-7 of the row, in[j + height] to 8-15.
+    _mm256_storeu_si256((__m256i *)(output + i * stride), u);
+  }
+}
+
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+  __m256i tmp, round;  // per int32 lane: (vec + half) >> bit, for bit >= 0
+  round = _mm256_set1_epi32((1 << bit) >> 1);  // 0 when bit == 0, no UB shift
+  tmp = _mm256_add_epi32(vec, round);
+  return _mm256_srai_epi32(tmp, bit);  // arithmetic shift keeps the sign
+}
+
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+                                                 __m256i *output,
+                                                 const int size,
+                                                 const int bit) {
+  if (bit > 0) {  // positive bit: rounding right shift of each vector
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = av1_round_shift_32_avx2(input[i], bit);
+    }
+  } else {  // bit <= 0: left shift by -bit (a plain copy when bit == 0)
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = _mm256_slli_epi32(input[i], -bit);
+    }
+  }
+}
+
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
   __m256i u0, u1, u2, u3, u4, u5, u6, u7;
   __m256i x0, x1;
 
-  u0 = _mm256_unpacklo_epi32(in[0], in[4]);
-  u1 = _mm256_unpackhi_epi32(in[0], in[4]);
+  u0 = _mm256_unpacklo_epi32(in[0], in[1]);  // interleave 32-bit lanes of rows 0, 1
+  u1 = _mm256_unpackhi_epi32(in[0], in[1]);
 
-  u2 = _mm256_unpacklo_epi32(in[8], in[12]);
-  u3 = _mm256_unpackhi_epi32(in[8], in[12]);
+  u2 = _mm256_unpacklo_epi32(in[2], in[3]);
+  u3 = _mm256_unpackhi_epi32(in[2], in[3]);
 
-  u4 = _mm256_unpacklo_epi32(in[16], in[20]);
-  u5 = _mm256_unpackhi_epi32(in[16], in[20]);
+  u4 = _mm256_unpacklo_epi32(in[4], in[5]);
+  u5 = _mm256_unpackhi_epi32(in[4], in[5]);
 
-  u6 = _mm256_unpacklo_epi32(in[24], in[28]);
-  u7 = _mm256_unpackhi_epi32(in[24], in[28]);
+  u6 = _mm256_unpacklo_epi32(in[6], in[7]);
+  u7 = _mm256_unpackhi_epi32(in[6], in[7]);
 
   x0 = _mm256_unpacklo_epi64(u0, u2);
   x1 = _mm256_unpacklo_epi64(u4, u6);
   out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
+  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);  // 0x31: high 128-bit halves
 
   x0 = _mm256_unpackhi_epi64(u0, u2);
   x1 = _mm256_unpackhi_epi64(u4, u6);
-  out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
+  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
 
   x0 = _mm256_unpacklo_epi64(u1, u3);
   x1 = _mm256_unpacklo_epi64(u5, u7);
-  out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
+  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
 
   x0 = _mm256_unpackhi_epi64(u1, u3);
   x1 = _mm256_unpackhi_epi64(u5, u7);
-  out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
-  out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
-}
-
-static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
-  transpose_32x32_8x8(&in[0], &out[0]);
-  transpose_32x32_8x8(&in[1], &out[32]);
-  transpose_32x32_8x8(&in[32], &out[1]);
-  transpose_32x32_8x8(&in[33], &out[33]);
-}
-
-static void transpose_32x32(const __m256i *in, __m256i *out) {
-  transpose_32x32_16x16(&in[0], &out[0]);
-  transpose_32x32_16x16(&in[2], &out[64]);
-  transpose_32x32_16x16(&in[64], &out[2]);
-  transpose_32x32_16x16(&in[66], &out[66]);
+  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);  // out[k] = lane k of in[0..7]
 }
 
-static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in,
+                              int input_stiride, int size) {  // stride in int32s
   int i;
-  for (i = 0; i < 128; ++i) {
-    in[i] = _mm256_loadu_si256((const __m256i *)coeff);
-    coeff += 8;
+  for (i = 0; i < size; ++i) {  // size = number of 8 x int32 vectors to load
+    in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stiride));
   }
 }
 
-static __m256i highbd_clamp_epi32(__m256i x, int bd) {
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
-  __m256i clamped, mask;
-
-  mask = _mm256_cmpgt_epi16(x, max);
-  clamped = _mm256_andnot_si256(mask, x);
-  mask = _mm256_and_si256(mask, max);
-  clamped = _mm256_or_si256(mask, clamped);
-  mask = _mm256_cmpgt_epi16(clamped, zero);
-  clamped = _mm256_and_si256(clamped, mask);
-
-  return clamped;
-}
-
-static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
-  const __m256i zero = _mm256_setzero_si256();
-  int i = 0;
-  (void)fliplr;
-  (void)flipud;
-
-  __m256i round = _mm256_set1_epi32((1 << shift) >> 1);
-
-  while (i < 128) {
-    u0 = _mm256_loadu_si256((const __m256i *)output);
-    u1 = _mm256_loadu_si256((const __m256i *)(output + 16));
-
-    x0 = _mm256_unpacklo_epi16(u0, zero);
-    x1 = _mm256_unpackhi_epi16(u0, zero);
-    x2 = _mm256_unpacklo_epi16(u1, zero);
-    x3 = _mm256_unpackhi_epi16(u1, zero);
-
-    v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
-    v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
-    v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
-    v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
-
-    v0 = _mm256_add_epi32(v0, round);
-    v1 = _mm256_add_epi32(v1, round);
-    v2 = _mm256_add_epi32(v2, round);
-    v3 = _mm256_add_epi32(v3, round);
-
-    v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift));
-    v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift));
-    v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift));
-    v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift));
-
-    v0 = _mm256_add_epi32(v0, x0);
-    v1 = _mm256_add_epi32(v1, x1);
-    v2 = _mm256_add_epi32(v2, x2);
-    v3 = _mm256_add_epi32(v3, x3);
-
-    v0 = _mm256_packus_epi32(v0, v1);
-    v2 = _mm256_packus_epi32(v2, v3);
-
-    v0 = highbd_clamp_epi32(v0, bd);
-    v2 = highbd_clamp_epi32(v2, bd);
-
-    _mm256_storeu_si256((__m256i *)output, v0);
-    _mm256_storeu_si256((__m256i *)(output + 16), v2);
-    output += stride;
-    i += 4;
-  }
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
+                                      const __m256i *rounding, int bit) {
+  __m256i x;  // result: (w0 * n0 + rounding) >> bit in every 32-bit lane
+  x = _mm256_mullo_epi32(*w0, *n0);  // keeps the low 32 bits of each product
+  x = _mm256_add_epi32(x, *rounding);
+  x = _mm256_srai_epi32(x, bit);  // arithmetic (sign-preserving) shift
+  return x;
 }
 
 static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
@@ -200,18 +197,549 @@ static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
   __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
   __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
 
+  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
   a0 = _mm256_max_epi32(a0, *clamp_lo);
   a0 = _mm256_min_epi32(a0, *clamp_hi);
   a1 = _mm256_max_epi32(a1, *clamp_lo);
   a1 = _mm256_min_epi32(a1, *clamp_hi);
 
-  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
   *out0 = a0;
   *out1 = a1;
 }
 
+static INLINE void idct32_stage4_avx2(
+    __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+    const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+    const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+    const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;  // hold one result while its old input is still read
+  temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+  bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+  bf1[17] = temp1;  // stored last: the call above still needed the old bf1[17]
+  // Same in-place pattern for the pairs (18, 29), (21, 26) and (22, 25).
+  temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+  bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+  bf1[18] = temp2;
+
+  temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+  bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+  bf1[21] = temp1;
+
+  temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+  bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+  bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_avx2(
+    __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+    const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+    const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;  // pairs (9, 14) and (10, 13) are updated in place
+  temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+  bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+  bf1[9] = temp1;  // stored last: the call above still needed the old bf1[9]
+
+  temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+  bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+  bf1[10] = temp2;
+  // Combine bf1[16..31] pairwise; results go back to the same two elements.
+  addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_avx2(
+    __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+    const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;  // hold one result while its old input is still read
+  temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[5] = temp1;  // stored last: the call above still needed the old bf1[5]
+  // Combine bf1[8..15] pairwise; results go back to the same two elements.
+  addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+  // In-place half_btf updates of pairs (18, 29), (19, 28), (20, 27), (21, 26).
+  temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+  bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+  bf1[18] = temp1;
+  temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+  bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+  bf1[19] = temp2;
+  temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+  bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+  bf1[20] = temp1;
+  temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+  bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+  bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;  // hold one result while its old input is still read
+  addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+  // In-place half_btf updates of pairs (10, 13) and (11, 12).
+  temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[10] = temp1;
+  temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[11] = temp2;
+  // Combine bf1[16..31] pairwise; results go back to the same two elements.
+  addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  __m256i temp1, temp2;  // hold one result while its old input is still read
+  addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+  addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+  // In-place half_btf updates of pairs (20, 27), (21, 26), (22, 25), (23, 24).
+  temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[20] = temp1;
+  temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[21] = temp2;
+  temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[22] = temp1;
+  temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+                                      const int do_cols, const int bd,
+                                      const int out_shift,
+                                      const int log_range) {
+  if (do_cols) {  // do_cols: out[k], out[31 - k] from bf1[k], bf1[31 - k]
+    addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
+    addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
+    addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
+    addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
+    addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
+    addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
+    addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
+    addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
+    addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
+    addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
+    addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
+    addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
+    addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
+    addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
+    addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
+    addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
+  } else {  // otherwise: same pairing, but shifted by out_shift and clamped
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    // NOTE(review): the second hi bound has no "- 1", unlike the first; confirm.
+    addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+    addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+                      &clamp_hi_out, out_shift);
+  }
+}
+
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {  // reads only in[0]
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));  // bias for >> bit
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i x;  // single result vector, copied to every out[i] below
+  // stage 0
+  // stage 1
+  // stage 2
+  // stage 3
+  // stage 4: nothing is computed for stages 0-4; only in[0] is read.
+  // stage 5: x = (in[0] * cospi[32] + rounding) >> bit
+  x = _mm256_mullo_epi32(in[0], cospi32);
+  x = _mm256_add_epi32(x, rounding);
+  x = _mm256_srai_epi32(x, bit);
+
+  // stage 6
+  // stage 7
+  // stage 8: stages 6-8 leave x unchanged.
+  // stage 9: clamp only (do_cols), else round-shift by out_shift, then clamp.
+  if (do_cols) {
+    x = _mm256_max_epi32(x, clamp_lo);
+    x = _mm256_min_epi32(x, clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);  // rounding bias
+    x = _mm256_add_epi32(offset, x);
+    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));  // arithmetic shift
+    x = _mm256_max_epi32(x, clamp_lo_out);
+    x = _mm256_min_epi32(x, clamp_hi_out);
+  }
+  // Every one of the 32 output vectors receives the same value.
+  out[0] = x;
+  out[1] = x;
+  out[2] = x;
+  out[3] = x;
+  out[4] = x;
+  out[5] = x;
+  out[6] = x;
+  out[7] = x;
+  out[8] = x;
+  out[9] = x;
+  out[10] = x;
+  out[11] = x;
+  out[12] = x;
+  out[13] = x;
+  out[14] = x;
+  out[15] = x;
+  out[16] = x;
+  out[17] = x;
+  out[18] = x;
+  out[19] = x;
+  out[20] = x;
+  out[21] = x;
+  out[22] = x;
+  out[23] = x;
+  out[24] = x;
+  out[25] = x;
+  out[26] = x;
+  out[27] = x;
+  out[28] = x;
+  out[29] = x;
+  out[30] = x;
+  out[31] = x;
+}
+
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {  // reads only in[0..7]
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));  // bias for >> bit
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i bf1[32];  // butterfly working set, updated in place stage by stage
+
+  {
+    // stage 0
+    // stage 1: load in[0..7]; the remaining bf1 slots are not loaded.
+    bf1[0] = in[0];
+    bf1[4] = in[4];
+    bf1[8] = in[2];
+    bf1[12] = in[6];
+    bf1[16] = in[1];
+    bf1[20] = in[5];
+    bf1[24] = in[3];
+    bf1[28] = in[7];
+
+    // stage 2: one-input butterflies (half_btf_0_avx2 is defined elsewhere).
+    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+    // stage 3
+    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+    bf1[17] = bf1[16];  // copies where idct32_low16_avx2 uses addsub_avx2
+    bf1[18] = bf1[19];
+    bf1[21] = bf1[20];
+    bf1[22] = bf1[23];
+    bf1[25] = bf1[24];
+    bf1[26] = bf1[27];
+    bf1[29] = bf1[28];
+    bf1[30] = bf1[31];
+
+    // stage 4
+    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+    bf1[9] = bf1[8];  // plain copies again, as in stage 3
+    bf1[10] = bf1[11];
+    bf1[13] = bf1[12];
+    bf1[14] = bf1[15];
+
+    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+    // stage 5
+    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+    bf1[1] = bf1[0];
+    bf1[5] = bf1[4];
+    bf1[6] = bf1[7];
+
+    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
+
+    // stage 6
+    bf1[3] = bf1[0];
+    bf1[2] = bf1[1];
+
+    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+    // stage 7
+    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 8
+    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 9: final add/sub pairs, written to out[k] and out[31 - k].
+    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {  // reads only in[0..15]
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));  // bias for >> bit
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i bf1[32];  // butterfly working set, updated in place stage by stage
+
+  {
+    // stage 0
+    // stage 1: load in[0..15] into the even bf1 slots; odd slots not loaded.
+    bf1[0] = in[0];
+    bf1[2] = in[8];
+    bf1[4] = in[4];
+    bf1[6] = in[12];
+    bf1[8] = in[2];
+    bf1[10] = in[10];
+    bf1[12] = in[6];
+    bf1[14] = in[14];
+    bf1[16] = in[1];
+    bf1[18] = in[9];
+    bf1[20] = in[5];
+    bf1[22] = in[13];
+    bf1[24] = in[3];
+    bf1[26] = in[11];
+    bf1[28] = in[7];
+    bf1[30] = in[15];
+
+    // stage 2: one-input butterflies (half_btf_0_avx2 is defined elsewhere).
+    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+    bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+    bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+    bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+    bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+    bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+    bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+    bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+    bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+    // stage 3
+    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+    bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+    bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+    bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+    bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+    addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+    // stage 4
+    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+    bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+    bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+    addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+    // stage 5
+    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+    bf1[1] = bf1[0];
+    bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+    bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+    addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
+
+    // stage 6
+    addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+    // stage 7
+    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 8
+    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+    // stage 9: final add/sub pairs, written to out[k] and out[31 - k].
+    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
 static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
                         int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
@@ -270,43 +798,42 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
   __m256i bf1[32], bf0[32];
-  int col;
 
-  for (col = 0; col < 4; ++col) {
+  {
     // stage 0
     // stage 1
-    bf1[0] = in[0 * 4 + col];
-    bf1[1] = in[16 * 4 + col];
-    bf1[2] = in[8 * 4 + col];
-    bf1[3] = in[24 * 4 + col];
-    bf1[4] = in[4 * 4 + col];
-    bf1[5] = in[20 * 4 + col];
-    bf1[6] = in[12 * 4 + col];
-    bf1[7] = in[28 * 4 + col];
-    bf1[8] = in[2 * 4 + col];
-    bf1[9] = in[18 * 4 + col];
-    bf1[10] = in[10 * 4 + col];
-    bf1[11] = in[26 * 4 + col];
-    bf1[12] = in[6 * 4 + col];
-    bf1[13] = in[22 * 4 + col];
-    bf1[14] = in[14 * 4 + col];
-    bf1[15] = in[30 * 4 + col];
-    bf1[16] = in[1 * 4 + col];
-    bf1[17] = in[17 * 4 + col];
-    bf1[18] = in[9 * 4 + col];
-    bf1[19] = in[25 * 4 + col];
-    bf1[20] = in[5 * 4 + col];
-    bf1[21] = in[21 * 4 + col];
-    bf1[22] = in[13 * 4 + col];
-    bf1[23] = in[29 * 4 + col];
-    bf1[24] = in[3 * 4 + col];
-    bf1[25] = in[19 * 4 + col];
-    bf1[26] = in[11 * 4 + col];
-    bf1[27] = in[27 * 4 + col];
-    bf1[28] = in[7 * 4 + col];
-    bf1[29] = in[23 * 4 + col];
-    bf1[30] = in[15 * 4 + col];
-    bf1[31] = in[31 * 4 + col];
+    bf1[0] = in[0];
+    bf1[1] = in[16];
+    bf1[2] = in[8];
+    bf1[3] = in[24];
+    bf1[4] = in[4];
+    bf1[5] = in[20];
+    bf1[6] = in[12];
+    bf1[7] = in[28];
+    bf1[8] = in[2];
+    bf1[9] = in[18];
+    bf1[10] = in[10];
+    bf1[11] = in[26];
+    bf1[12] = in[6];
+    bf1[13] = in[22];
+    bf1[14] = in[14];
+    bf1[15] = in[30];
+    bf1[16] = in[1];
+    bf1[17] = in[17];
+    bf1[18] = in[9];
+    bf1[19] = in[25];
+    bf1[20] = in[5];
+    bf1[21] = in[21];
+    bf1[22] = in[13];
+    bf1[23] = in[29];
+    bf1[24] = in[3];
+    bf1[25] = in[19];
+    bf1[26] = in[11];
+    bf1[27] = in[27];
+    bf1[28] = in[7];
+    bf1[29] = in[23];
+    bf1[30] = in[15];
+    bf1[31] = in[31];
 
     // stage 2
     bf0[0] = bf1[0];
@@ -568,91 +1095,255 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
 
     // stage 9
     if (do_cols) {
-      addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col,
-                           out + 31 * 4 + col);
-      addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col,
-                           out + 30 * 4 + col);
-      addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col,
-                           out + 29 * 4 + col);
-      addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col,
-                           out + 28 * 4 + col);
-      addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col,
-                           out + 27 * 4 + col);
-      addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col,
-                           out + 26 * 4 + col);
-      addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col,
-                           out + 25 * 4 + col);
-      addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col,
-                           out + 24 * 4 + col);
-      addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col,
-                           out + 23 * 4 + col);
-      addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col,
-                           out + 22 * 4 + col);
-      addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
-                           out + 21 * 4 + col);
-      addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
-                           out + 20 * 4 + col);
-      addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
-                           out + 19 * 4 + col);
-      addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
-                           out + 18 * 4 + col);
-      addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
-                           out + 17 * 4 + col);
-      addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
-                           out + 16 * 4 + col);
+      addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
+      addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
+      addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
+      addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
+      addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
+      addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
+      addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
+      addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
+      addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
+      addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
+      addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
+      addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
+      addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
+      addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
+      addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
+      addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
     } else {
-      addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col,
-                        &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
-                        out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
-                        out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
-                        out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
-                        out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
-                        out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
-                        out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+      addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
     }
   }
 }
 
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
-                                   int stride, TX_TYPE tx_type, int bd) {
-  __m256i in[128], out[128];
-  const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
-  const int txw_idx = get_txw_idx(TX_32X32);
-  const int txh_idx = get_txh_idx(TX_32X32);
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
+                                  int do_cols, int bd, int out_shift);
+
+static const transform_1d_avx2
+    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      {
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      // Only the entry above is populated; lookups elsewhere yield NULL.
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };  // indexed as [txw_idx or txh_idx][1-D transform type][fun_idx]
+
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
+                                                   uint16_t *output, int stride,
+                                                   TX_TYPE tx_type,
+                                                   TX_SIZE tx_size, int eob,
+                                                   const int bd) {
+  __m256i buf1[64 * 2];  // row-pass results, transposed for the column pass
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // 8-wide tiles loaded
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row groups run
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_avx2 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_avx2 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);  // the table holds NULL for unsupported entries
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);  // lr_flip is not used below
 
+  // 1st stage: row transform
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+    __m256i buf0[32];
+    const int32_t *input_row = input + i * input_stride * 8;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      __m256i *buf0_cur = buf0 + j * 8;
+      load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8);
+
+      transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
+    }
+
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m256i *_buf1 = buf1 + i * 8;  // destination for this group of 8 rows
+    for (int j = 0; j < buf_size_w_div8; ++j) {
+      transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+    }
+  }
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);  // do_cols = 1
+
+    av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer, one 16-column strip of the output per call
+  {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
+                                    output + 16 * i, stride, ud_flip,
+                                    txfm_size_row, bd);
+    }
+  }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+                                             uint8_t *output, int stride,
+                                             TX_TYPE tx_type, TX_SIZE tx_size,
+                                             int eob, const int bd) {
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_32x32(coeff, in);
-      transpose_32x32(in, out);
-      idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
-      transpose_32x32(in, out);
-      idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
+      highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
+                                             stride, tx_type, tx_size, eob, bd);
       break;
+    default: assert(0); break;  // only DCT_DCT is handled by this entry point
+  }
+}
+
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);  // used by the IDTX C path only
+  switch (tx_type) {
+    case DCT_DCT:  // NOTE(review): passes `input`, not `src`, to the AVX2 path
+      av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
+                                              txfm_param->tx_size,
+                                              txfm_param->eob, bd);
+      break;
+      // Assembly version doesn't support IDTX, so use C version for it.
+    case IDTX:
+      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                 tx_type, bd);
+      break;
+
     default: assert(0);
   }
 }
+
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  switch (tx_size) {  // dispatch on transform size; one callee per case
+    case TX_32X32:  // the only size routed to an *_avx2 function here
+      av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
+      break;
+    case TX_16X16:
+      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      break;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X16:
+      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X8:
+      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X32:
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X16:
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      break;
+    case TX_32X64:
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X32:
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      break;
+    case TX_64X64:  // these three sizes share the SSE4.1 "universe" entry
+    case TX_16X64:
+    case TX_64X16:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(
+          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+          txfm_param->eob, txfm_param->bd);
+      break;
+    default: assert(0 && "Invalid transform size"); break;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
index 801a4133b..e29e0baf5 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -15,8 +15,60 @@
 #include "config/av1_rtcd.h"
 
 #include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
 
+// Clamps every signed 16-bit lane of |u| to the pixel range [0, 2^bd - 1].
+//
+// The comparison is signed, so a lane whose top bit is set (e.g. a value of
+// 0x8000 or above produced by an unsigned-saturating pack) counts as negative
+// and is clamped to 0 rather than to the maximum.
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+  const __m128i lane_min = _mm_setzero_si128();
+  const __m128i unit = _mm_set1_epi16(1);
+  const __m128i lane_max = _mm_sub_epi16(_mm_slli_epi16(unit, bd), unit);
+
+  // Signed min against the ceiling, then signed max against the floor; this
+  // selects exactly what the compare-and-mask formulation selects.
+  const __m128i capped = _mm_min_epi16(u, lane_max);
+  return _mm_max_epi16(capped, lane_min);
+}
+
+// Adds residuals res0 (lanes 0-3) and res1 (lanes 4-7) to the 8 pixels in pred
+// and returns them clamped to [0, 2^bd - 1]. The ceiling is applied in 32 bits:
+// a packed value >= 0x8000 would read as negative in a signed 16-bit clamp.
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+                                                  __m128i res0, __m128i res1,
+                                                  const int bd) {
+  const __m128i max_px = _mm_set1_epi32((1 << bd) - 1);
+  __m128i x0 = _mm_add_epi32(res0, _mm_cvtepi16_epi32(pred));
+  __m128i x1 = _mm_add_epi32(res1, _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)));
+  // packus_epi32 saturates negatives to 0, so only the ceiling needs a min.
+  return _mm_packus_epi32(_mm_min_epi32(x0, max_px), _mm_min_epi32(x1, max_px));
+}
+
+// Adds 8-pixel residual rows to |height| rows of output in place; row i takes
+// in[j] and in[j + height], where j is i, or height - 1 - i when flipud is set.
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+                                                  int stride, int flipud,
+                                                  int height, const int bd) {
+  for (int row = 0; row < height; ++row) {
+    const int src = flipud ? height - 1 - row : row;
+    __m128i px = _mm_loadu_si128((__m128i const *)(output + row * stride));
+    px = highbd_get_recon_8x8_sse4_1(px, in[src], in[src + height], bd);
+    _mm_storeu_si128((__m128i *)(output + row * stride), px);
+  }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                           __m128i *out, int out_size) {
+  for (int row = 0; row < out_size; ++row) {  // one unaligned 4 x int32 load
+    out[row] = _mm_loadu_si128((const __m128i *)&in[row * stride]);
+  }
+}
+
 static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
   in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
   in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
@@ -57,18 +109,231 @@ static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
   __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
   __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
 
+  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
   a0 = _mm_max_epi32(a0, *clamp_lo);
   a0 = _mm_min_epi32(a0, *clamp_hi);
   a1 = _mm_max_epi32(a1, *clamp_lo);
   a1 = _mm_min_epi32(a1, *clamp_hi);
 
-  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
   *out0 = a0;
   *out1 = a1;
 }
 
+static INLINE void idct32_stage4_sse4_1(
+    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+    const __m128i *rounding, int bit) {
+  __m128i temp1, temp2;  // first result of a pair, stored after the second
+  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+  bf1[17] = temp1;
+  // Same in-place pattern for the (18, 29), (21, 26) and (22, 25) pairs.
+  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+  bf1[29] =
+      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+  bf1[18] = temp2;
+
+  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+  bf1[26] =
+      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+  bf1[21] = temp1;
+
+  temp2 =
+      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+  bf1[25] =
+      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+  bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_sse4_1(
+    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
+  __m128i temp1, temp2;  // first result of a pair, stored after the second
+  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+  bf1[9] = temp1;
+
+  temp2 =
+      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+  bf1[10] = temp2;
+  // Presumably sum -> 1st slot, difference -> 2nd, both clamped (confirm).
+  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_sse4_1(
+    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+    const __m128i *rounding, int bit) {
+  __m128i temp1, temp2;  // first result of a pair, stored after the second
+  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[5] = temp1;
+  // Presumably sum -> 1st slot, difference -> 2nd, both clamped (confirm).
+  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+  bf1[29] =
+      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+  bf1[18] = temp1;
+  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+  bf1[28] =
+      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+  bf1[19] = temp2;
+  temp1 =
+      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+  bf1[27] =
+      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+  bf1[20] = temp1;
+  temp2 =
+      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+  bf1[26] =
+      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+  bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rounding, int bit) {
+  __m128i temp1, temp2;  // first result of a pair, stored after the second
+  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[10] = temp1;
+  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[12] =
+      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[11] = temp2;
+  // Presumably sum -> 1st slot, difference -> 2nd, both clamped (confirm).
+  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rounding, int bit) {
+  __m128i temp1, temp2;  // first result of a pair, stored after the second
+  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+  // Pairs (20, 27), (21, 26), (22, 25), (23, 24) use cospim32/cospi32 weights.
+  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[27] =
+      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+  bf1[20] = temp1;
+  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[26] =
+      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+  bf1[21] = temp2;
+  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[25] =
+      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+  bf1[22] = temp1;
+  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[24] =
+      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+  bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
+                                        const int do_cols, const int bd,
+                                        const int out_shift,
+                                        const int log_range) {
+  if (do_cols) {  // out_shift and the clamp bounds are not used on this path
+    addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
+    addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
+    addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
+    addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
+    addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
+    addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
+    addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
+    addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
+    addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
+    addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
+    addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
+    addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
+    addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
+    addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
+    addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
+    addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
+  } else {  // add/sub, then >> out_shift, then clamp to the bounds below
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    // NOTE(review): the upper bound's second operand has no "- 1" -- confirm.
+    addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+  }
+}
+
 static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                              __m128i *out0, __m128i *out1,
                              const __m128i *clamp_lo, const __m128i *clamp_hi,
@@ -77,14 +342,14 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
   __m128i a0 = _mm_add_epi32(offset, in0);
   __m128i a1 = _mm_sub_epi32(offset, in1);
 
+  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
   a0 = _mm_max_epi32(a0, *clamp_lo);
   a0 = _mm_min_epi32(a0, *clamp_hi);
   a1 = _mm_max_epi32(a1, *clamp_lo);
   a1 = _mm_min_epi32(a1, *clamp_hi);
 
-  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
-  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
   *out0 = a0;
   *out1 = a1;
 }
@@ -96,9 +361,6 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3, x, y;
@@ -135,11 +397,19 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
   v3 = _mm_add_epi32(v3, rnding);
   v3 = _mm_srai_epi32(v3, bit);
 
-  addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
-  addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+  if (do_cols) {
+    addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
+    addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
+  } else {
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+  }
 }
 
-static void iadst4x4_sse4_1(__m128i *in, int bit) {
+static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
   const int32_t *sinpi = sinpi_arr(bit);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
@@ -197,6 +467,21 @@ static void iadst4x4_sse4_1(__m128i *in, int bit) {
   u3 = _mm_add_epi32(u3, rnding);
   u3 = _mm_srai_epi32(u3, bit);
 
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+    u0 = _mm_max_epi32(u0, clamp_lo);
+    u0 = _mm_min_epi32(u0, clamp_hi);
+    u1 = _mm_max_epi32(u1, clamp_lo);
+    u1 = _mm_min_epi32(u1, clamp_hi);
+    u2 = _mm_max_epi32(u2, clamp_lo);
+    u2 = _mm_min_epi32(u2, clamp_hi);
+    u3 = _mm_max_epi32(u3, clamp_lo);
+    u3 = _mm_min_epi32(u3, clamp_hi);
+  }
+
   in[0] = u0;
   in[1] = u1;
   in[2] = u2;
@@ -217,22 +502,6 @@ static INLINE void round_shift_4x4(__m128i *in, int shift) {
   in[3] = _mm_srai_epi32(in[3], shift);
 }
 
-static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  __m128i clamped, mask;
-
-  mask = _mm_cmpgt_epi16(u, max);
-  clamped = _mm_andnot_si128(mask, u);
-  mask = _mm_and_si128(mask, max);
-  clamped = _mm_or_si128(mask, clamped);
-  mask = _mm_cmpgt_epi16(clamped, zero);
-  clamped = _mm_and_si128(clamped, mask);
-
-  return clamped;
-}
-
 static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                              int fliplr, int flipud, int shift, int bd) {
   const __m128i zero = _mm_setzero_si128();
@@ -304,49 +573,49 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
     case ADST_DCT:
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
       idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case FLIPADST_DCT:
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
       idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case ADST_FLIPADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_ADST:
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
-      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     default: assert(0);
@@ -482,14 +751,19 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
       addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
       addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
     } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
       addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
       addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
       addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
       addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
     }
   }
 }
@@ -651,14 +925,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     out[12] = u[5];
     out[14] = _mm_sub_epi32(kZero, u[1]);
   } else {
-    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi,
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
-    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi,
+    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
   }
 
   // Odd 8 points: 1, 3, ..., 15
@@ -796,14 +1074,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     out[13] = u[5];
     out[15] = _mm_sub_epi32(kZero, u[1]);
   } else {
-    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi,
-                     out_shift);
-    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi,
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
-    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi,
+    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                      out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
   }
 }
 
@@ -976,81 +1258,51 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
   }
 }
 
-// 16x16
-static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
-  int i;
-  for (i = 0; i < 64; ++i) {
-    in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
-  }
-}
-
-static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
-                                        int col) {
-  int i;
-  for (i = 0; i < 16; i += 2) {
-    in8x8[i] = in[col];
-    in8x8[i + 1] = in[col + 1];
-    col += 4;
-  }
-}
-
-static void swap_addr(uint16_t **output1, uint16_t **output2) {
-  uint16_t *tmp;
-  tmp = *output1;
-  *output1 = *output2;
-  *output2 = tmp;
-}
-
-static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m128i in8x8[16];
-  uint16_t *leftUp = &output[0];
-  uint16_t *rightUp = &output[8];
-  uint16_t *leftDown = &output[8 * stride];
-  uint16_t *rightDown = &output[8 * stride + 8];
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                                int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m128i x;
 
-  if (fliplr) {
-    swap_addr(&leftUp, &rightUp);
-    swap_addr(&leftDown, &rightDown);
-  }
+  // stage 0
+  // stage 1
+  // stage 2
+  // stage 3
+  x = _mm_mullo_epi32(in[0], cospi32);
+  x = _mm_add_epi32(x, rnding);
+  x = _mm_srai_epi32(x, bit);
 
-  if (flipud) {
-    swap_addr(&leftUp, &leftDown);
-    swap_addr(&rightUp, &rightDown);
+  // stage 4
+  // stage 5
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+    x = _mm_add_epi32(x, offset);
+    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+    x = _mm_max_epi32(x, clamp_lo_out);
+    x = _mm_min_epi32(x, clamp_hi_out);
   }
 
-  // Left-up quarter
-  assign_8x8_input_from_16x16(in, in8x8, 0);
-  write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
-
-  // Right-up quarter
-  assign_8x8_input_from_16x16(in, in8x8, 2);
-  write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
-
-  // Left-down quarter
-  assign_8x8_input_from_16x16(in, in8x8, 32);
-  write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
-
-  // Right-down quarter
-  assign_8x8_input_from_16x16(in, in8x8, 34);
-  write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+  out[0] = x;
+  out[1] = x;
+  out[2] = x;
+  out[3] = x;
+  out[4] = x;
+  out[5] = x;
+  out[6] = x;
+  out[7] = x;
 }
 
-static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
-                             int bd, int out_shift) {
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                               int bd, int out_shift) {  // reads in[0..7], writes out[0..7]
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
-  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
-  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
-  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
-  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
-  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
-  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
-  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
-  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
-  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
-  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
-  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
@@ -1059,473 +1311,687 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
-  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  __m128i u[16], v[16], x, y;
-  int col;
-
-  for (col = 0; col < 4; ++col) {
-    // stage 0
-    // stage 1
-    u[0] = in[0 * 4 + col];
-    u[1] = in[8 * 4 + col];
-    u[2] = in[4 * 4 + col];
-    u[3] = in[12 * 4 + col];
-    u[4] = in[2 * 4 + col];
-    u[5] = in[10 * 4 + col];
-    u[6] = in[6 * 4 + col];
-    u[7] = in[14 * 4 + col];
-    u[8] = in[1 * 4 + col];
-    u[9] = in[9 * 4 + col];
-    u[10] = in[5 * 4 + col];
-    u[11] = in[13 * 4 + col];
-    u[12] = in[3 * 4 + col];
-    u[13] = in[11 * 4 + col];
-    u[14] = in[7 * 4 + col];
-    u[15] = in[15 * 4 + col];
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x, y;
 
-    // stage 2
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
+  // stages 0-2: in[0], in[4], in[2], in[6] are copied to u0..u3 unchanged.
+  // (in[1], in[7]) yield u4 and u7, (in[5], in[3]) yield u5 and u6; each is
+  // a cospi-weighted sum with rnding added, then shifted right by bit.
+  u0 = in[0];
+  u1 = in[4];
+  u2 = in[2];
+  u3 = in[6];
+
+  x = _mm_mullo_epi32(in[1], cospi56);
+  y = _mm_mullo_epi32(in[7], cospim8);
+  u4 = _mm_add_epi32(x, y);
+  u4 = _mm_add_epi32(u4, rnding);
+  u4 = _mm_srai_epi32(u4, bit);
+
+  x = _mm_mullo_epi32(in[1], cospi8);
+  y = _mm_mullo_epi32(in[7], cospi56);
+  u7 = _mm_add_epi32(x, y);
+  u7 = _mm_add_epi32(u7, rnding);
+  u7 = _mm_srai_epi32(u7, bit);
+
+  x = _mm_mullo_epi32(in[5], cospi24);
+  y = _mm_mullo_epi32(in[3], cospim40);
+  u5 = _mm_add_epi32(x, y);
+  u5 = _mm_add_epi32(u5, rnding);
+  u5 = _mm_srai_epi32(u5, bit);
+
+  x = _mm_mullo_epi32(in[5], cospi40);
+  y = _mm_mullo_epi32(in[3], cospi24);
+  u6 = _mm_add_epi32(x, y);
+  u6 = _mm_add_epi32(u6, rnding);
+  u6 = _mm_srai_epi32(u6, bit);
 
-    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
-    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
-    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
-    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
-    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+  // stage 3: cospi-weighted mixes of (u0, u1) and (u2, u3); add/sub on u4..u7
+  x = _mm_mullo_epi32(u0, cospi32);
+  y = _mm_mullo_epi32(u1, cospi32);
+  v0 = _mm_add_epi32(x, y);
+  v0 = _mm_add_epi32(v0, rnding);
+  v0 = _mm_srai_epi32(v0, bit);
 
-    // stage 3
-    u[0] = v[0];
-    u[1] = v[1];
-    u[2] = v[2];
-    u[3] = v[3];
-    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
-    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
-    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
-    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
-    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+  v1 = _mm_sub_epi32(x, y);
+  v1 = _mm_add_epi32(v1, rnding);
+  v1 = _mm_srai_epi32(v1, bit);
 
-    // stage 4
-    x = _mm_mullo_epi32(u[0], cospi32);
-    y = _mm_mullo_epi32(u[1], cospi32);
-    v[0] = _mm_add_epi32(x, y);
-    v[0] = _mm_add_epi32(v[0], rnding);
-    v[0] = _mm_srai_epi32(v[0], bit);
+  x = _mm_mullo_epi32(u2, cospi48);
+  y = _mm_mullo_epi32(u3, cospim16);
+  v2 = _mm_add_epi32(x, y);
+  v2 = _mm_add_epi32(v2, rnding);
+  v2 = _mm_srai_epi32(v2, bit);
 
-    v[1] = _mm_sub_epi32(x, y);
-    v[1] = _mm_add_epi32(v[1], rnding);
-    v[1] = _mm_srai_epi32(v[1], bit);
+  x = _mm_mullo_epi32(u2, cospi16);
+  y = _mm_mullo_epi32(u3, cospi48);
+  v3 = _mm_add_epi32(x, y);
+  v3 = _mm_add_epi32(v3, rnding);
+  v3 = _mm_srai_epi32(v3, bit);
 
-    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
-    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
-    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
-    v[8] = u[8];
-    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
-    v[11] = u[11];
-    v[12] = u[12];
-    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
-    v[15] = u[15];
+  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
 
-    // stage 5
-    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
-    u[4] = v[4];
+  // stage 4: add/sub on v0..v3; v4, v7 pass through; v5, v6 mix via cospi[32]
+  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+  u4 = v4;
+  u7 = v7;
 
-    x = _mm_mullo_epi32(v[5], cospi32);
-    y = _mm_mullo_epi32(v[6], cospi32);
-    u[5] = _mm_sub_epi32(y, x);
-    u[5] = _mm_add_epi32(u[5], rnding);
-    u[5] = _mm_srai_epi32(u[5], bit);
+  x = _mm_mullo_epi32(v5, cospi32);
+  y = _mm_mullo_epi32(v6, cospi32);
+  u6 = _mm_add_epi32(y, x);
+  u6 = _mm_add_epi32(u6, rnding);
+  u6 = _mm_srai_epi32(u6, bit);
 
-    u[6] = _mm_add_epi32(y, x);
-    u[6] = _mm_add_epi32(u[6], rnding);
-    u[6] = _mm_srai_epi32(u[6], bit);
+  u5 = _mm_sub_epi32(y, x);
+  u5 = _mm_add_epi32(u5, rnding);
+  u5 = _mm_srai_epi32(u5, bit);
 
-    u[7] = v[7];
-    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+  // stage 5: combine uK with u(7-K); the !do_cols path also passes out_shift
+  if (do_cols) {
+    addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
+    addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
+    addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
+    addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+    addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+    addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+    addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
+                        out_shift);
+  }
+}
 
-    // stage 6
-    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
-    v[8] = u[8];
-    v[9] = u[9];
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                 int do_cols, int bd, int out_shift) {  // reads only in[0]
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i u[8], x;
 
-    x = _mm_mullo_epi32(u[10], cospi32);
-    y = _mm_mullo_epi32(u[13], cospi32);
-    v[10] = _mm_sub_epi32(y, x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  // stages 0-2: in[0] is the only input read by this function.
+  // u[0] = (in[0] * cospi[60] + rnding) >> bit
+  // u[1] = (-(in[0] * cospi[4]) + rnding) >> bit
 
-    v[13] = _mm_add_epi32(x, y);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+  x = _mm_mullo_epi32(in[0], cospi60);
+  u[0] = _mm_add_epi32(x, rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
 
-    x = _mm_mullo_epi32(u[11], cospi32);
-    y = _mm_mullo_epi32(u[12], cospi32);
-    v[11] = _mm_sub_epi32(y, x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  x = _mm_mullo_epi32(in[0], cospi4);
+  u[1] = _mm_sub_epi32(kZero, x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
 
-    v[12] = _mm_add_epi32(x, y);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  // stages 3-4: u[4] = u[0] * cospi[16] + u[1] * cospi[48] and
+  // u[5] = u[0] * cospi[48] - u[1] * cospi[16], each rounded and >> bit
+  __m128i temp1, temp2;
+  temp1 = _mm_mullo_epi32(u[0], cospi16);
+  x = _mm_mullo_epi32(u[1], cospi48);
+  temp1 = _mm_add_epi32(temp1, x);
+  temp1 = _mm_add_epi32(temp1, rnding);
+  temp1 = _mm_srai_epi32(temp1, bit);
+  u[4] = temp1;
+
+  temp2 = _mm_mullo_epi32(u[0], cospi48);
+  x = _mm_mullo_epi32(u[1], cospi16);
+  u[5] = _mm_sub_epi32(temp2, x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    v[14] = u[14];
-    v[15] = u[15];
+  // stages 5-6: cospi[32]-weighted sum and difference of (u[0], u[1]) give
+  // u[2], u[3]; the same is done on (u[4], u[5]) to give u[6], u[7]
+  temp1 = _mm_mullo_epi32(u[0], cospi32);
+  x = _mm_mullo_epi32(u[1], cospi32);
+  u[2] = _mm_add_epi32(temp1, x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    // stage 7
-    if (do_cols) {
-      addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col,
-                             out + 15 * 4 + col);
-      addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col,
-                             out + 14 * 4 + col);
-      addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col,
-                             out + 13 * 4 + col);
-      addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col,
-                             out + 12 * 4 + col);
-      addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col,
-                             out + 11 * 4 + col);
-      addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col,
-                             out + 10 * 4 + col);
-      addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col);
-      addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col);
-    } else {
-      addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-      addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col,
-                          &clamp_lo, &clamp_hi, out_shift);
-    }
+  u[3] = _mm_sub_epi32(temp1, x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  temp1 = _mm_mullo_epi32(u[4], cospi32);
+  x = _mm_mullo_epi32(u[5], cospi32);
+  u[6] = _mm_add_epi32(temp1, x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_sub_epi32(temp1, x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 7: emit u[0,4,6,2,3,7,5,1]; do_cols path negates odd-indexed outputs
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = _mm_sub_epi32(kZero, u[4]);
+    out[2] = u[6];
+    out[3] = _mm_sub_epi32(kZero, u[2]);
+    out[4] = u[3];
+    out[5] = _mm_sub_epi32(kZero, u[7]);
+    out[6] = u[5];
+    out[7] = _mm_sub_epi32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
   }
 }
 
-static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
-                              int bd, int out_shift) {
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                                int bd, int out_shift) {  // reads in[0..7], writes out[0..7]
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
-  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
-  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
-  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
-  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
-  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
-  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
-  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
-  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
-  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
-  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
-  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
-  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
-  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
-  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
-  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
-  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  __m128i u[16], v[16], x, y;
-  const int col_num = 4;
-  int col;
+  __m128i u[8], v[8], x;
 
-  // Calculate the column 0, 1, 2, 3
-  for (col = 0; col < col_num; ++col) {
-    // stage 0
-    // stage 1
-    // stage 2
-    v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
-    x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
-    v[0] = _mm_add_epi32(v[0], x);
-    v[0] = _mm_add_epi32(v[0], rnding);
-    v[0] = _mm_srai_epi32(v[0], bit);
+  // stages 0-2: inputs are taken in pairs (in[7], in[0]), (in[5], in[2]),
+  // (in[3], in[4]) and (in[1], in[6]). Each pair produces a weighted sum
+  // u[2k] and a weighted difference u[2k + 1], rounded and shifted by bit.
 
-    v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
-    x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
-    v[1] = _mm_sub_epi32(v[1], x);
-    v[1] = _mm_add_epi32(v[1], rnding);
-    v[1] = _mm_srai_epi32(v[1], bit);
+  u[0] = _mm_mullo_epi32(in[7], cospi4);
+  x = _mm_mullo_epi32(in[0], cospi60);
+  u[0] = _mm_add_epi32(u[0], x);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
 
-    v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
-    x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
-    v[2] = _mm_add_epi32(v[2], x);
-    v[2] = _mm_add_epi32(v[2], rnding);
-    v[2] = _mm_srai_epi32(v[2], bit);
+  u[1] = _mm_mullo_epi32(in[7], cospi60);
+  x = _mm_mullo_epi32(in[0], cospi4);
+  u[1] = _mm_sub_epi32(u[1], x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
 
-    v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
-    x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
-    v[3] = _mm_sub_epi32(v[3], x);
-    v[3] = _mm_add_epi32(v[3], rnding);
-    v[3] = _mm_srai_epi32(v[3], bit);
+  // (2) pair (in[5], in[2]) with cospi[20], cospi[44] -> u[2], u[3]
+  u[2] = _mm_mullo_epi32(in[5], cospi20);
+  x = _mm_mullo_epi32(in[2], cospi44);
+  u[2] = _mm_add_epi32(u[2], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
-    x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
+  u[3] = _mm_mullo_epi32(in[5], cospi44);
+  x = _mm_mullo_epi32(in[2], cospi20);
+  u[3] = _mm_sub_epi32(u[3], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
 
-    v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
-    x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
+  // (3) pair (in[3], in[4]) with cospi[36], cospi[28] -> u[4], u[5]
+  u[4] = _mm_mullo_epi32(in[3], cospi36);
+  x = _mm_mullo_epi32(in[4], cospi28);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
 
-    v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
-    x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
-    v[6] = _mm_add_epi32(v[6], x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+  u[5] = _mm_mullo_epi32(in[3], cospi28);
+  x = _mm_mullo_epi32(in[4], cospi36);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
-    x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
-    v[7] = _mm_sub_epi32(v[7], x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+  // (4) pair (in[1], in[6]) with cospi[52], cospi[12] -> u[6], u[7]
+  u[6] = _mm_mullo_epi32(in[1], cospi52);
+  x = _mm_mullo_epi32(in[6], cospi12);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
-    x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
+  u[7] = _mm_mullo_epi32(in[1], cospi12);
+  x = _mm_mullo_epi32(in[6], cospi52);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
-    x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
+  // stage 3: add/sub of u[k] and u[k + 4] into v[k], v[k + 4]
+  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 
-    v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
-    x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
-    v[10] = _mm_add_epi32(v[10], x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  // stage 4: v[0..3] pass through; v[4..7] are mixed using cospi[16]/cospi[48]
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
 
-    v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
-    x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
-    v[11] = _mm_sub_epi32(v[11], x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  u[4] = _mm_mullo_epi32(v[4], cospi16);
+  x = _mm_mullo_epi32(v[5], cospi48);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
 
-    v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
-    x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  u[5] = _mm_mullo_epi32(v[4], cospi48);
+  x = _mm_mullo_epi32(v[5], cospi16);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
 
-    v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
-    x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+  u[6] = _mm_mullo_epi32(v[6], cospim48);
+  x = _mm_mullo_epi32(v[7], cospi16);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
-    x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  u[7] = _mm_mullo_epi32(v[6], cospi16);
+  x = _mm_mullo_epi32(v[7], cospim48);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
-    x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+  // stage 5: add/sub of u[k] and u[k + 2] within each group of four
+  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
 
-    // stage 3
-    addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+  // stage 6: cospi[32] sum/difference on (v[2], v[3]) and (v[6], v[7]); v[0] is scratch
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
 
-    // stage 4
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
+  v[0] = _mm_mullo_epi32(v[2], cospi32);
+  x = _mm_mullo_epi32(v[3], cospi32);
+  u[2] = _mm_add_epi32(v[0], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
 
-    v[8] = _mm_mullo_epi32(u[8], cospi8);
-    x = _mm_mullo_epi32(u[9], cospi56);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
+  u[3] = _mm_sub_epi32(v[0], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
 
-    v[9] = _mm_mullo_epi32(u[8], cospi56);
-    x = _mm_mullo_epi32(u[9], cospi8);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
+  v[0] = _mm_mullo_epi32(v[6], cospi32);
+  x = _mm_mullo_epi32(v[7], cospi32);
+  u[6] = _mm_add_epi32(v[0], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
 
-    v[10] = _mm_mullo_epi32(u[10], cospi40);
-    x = _mm_mullo_epi32(u[11], cospi24);
-    v[10] = _mm_add_epi32(v[10], x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
+  u[7] = _mm_sub_epi32(v[0], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    v[11] = _mm_mullo_epi32(u[10], cospi24);
-    x = _mm_mullo_epi32(u[11], cospi40);
-    v[11] = _mm_sub_epi32(v[11], x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
+  // stage 7: emit u[0,4,6,2,3,7,5,1]; do_cols path negates odd-indexed outputs
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = _mm_sub_epi32(kZero, u[4]);
+    out[2] = u[6];
+    out[3] = _mm_sub_epi32(kZero, u[2]);
+    out[4] = u[3];
+    out[5] = _mm_sub_epi32(kZero, u[7]);
+    out[6] = u[5];
+    out[7] = _mm_sub_epi32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 
-    v[12] = _mm_mullo_epi32(u[12], cospim56);
-    x = _mm_mullo_epi32(u[13], cospi8);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+  }
+}
 
-    v[13] = _mm_mullo_epi32(u[12], cospi8);
-    x = _mm_mullo_epi32(u[13], cospim56);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {  // reads only in[0]
   const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 
-    v[14] = _mm_mullo_epi32(u[14], cospim24);
-    x = _mm_mullo_epi32(u[15], cospi40);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+  {
+    // stages 0-4
+    // Only in[0] is read by this function. It is multiplied by cospi[32],
+    // rounded with rnding (1 << (bit - 1)) and arithmetically shifted right
+    // by bit. NOTE: the result is stored back into in[0], so the caller's
+    // input buffer is modified.
+    in[0] = _mm_mullo_epi32(in[0], cospi32);
+    in[0] = _mm_add_epi32(in[0], rnding);
+    in[0] = _mm_srai_epi32(in[0], bit);
 
-    v[15] = _mm_mullo_epi32(u[14], cospi40);
-    x = _mm_mullo_epi32(u[15], cospim24);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+    // stages 5-7
+    // do_cols != 0: clamp to [clamp_lo, clamp_hi]. do_cols == 0: add the
+    // rounding offset, shift right by out_shift, then clamp to the out range.
+    if (do_cols) {
+      in[0] = _mm_max_epi32(in[0], clamp_lo);
+      in[0] = _mm_min_epi32(in[0], clamp_hi);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);  // half of 1 << out_shift
+      in[0] = _mm_add_epi32(in[0], offset);
+      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+      in[0] = _mm_max_epi32(in[0], clamp_lo_out);
+      in[0] = _mm_min_epi32(in[0], clamp_hi_out);
+    }
+
+    out[0] = in[0];  // the same vector is written to all 16 outputs
+    out[1] = in[0];
+    out[2] = in[0];
+    out[3] = in[0];
+    out[4] = in[0];
+    out[5] = in[0];
+    out[6] = in[0];
+    out[7] = in[0];
+    out[8] = in[0];
+    out[9] = in[0];
+    out[10] = in[0];
+    out[11] = in[0];
+    out[12] = in[0];
+    out[13] = in[0];
+    out[14] = in[0];
+    out[15] = in[0];
+  }
+}
+
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    u[0] = in[0];
+    u[2] = in[4];
+    u[4] = in[2];
+    u[6] = in[6];
+    u[8] = in[1];
+    u[10] = in[5];
+    u[12] = in[3];
+    u[14] = in[7];
+
+    // stage 2
+    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+    u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+    u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+    u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+    u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+    // stage 3
+    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+    u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+    u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+    addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = _mm_mullo_epi32(u[0], cospi32);
+    u[0] = _mm_add_epi32(x, rnding);
+    u[0] = _mm_srai_epi32(u[0], bit);
+    u[1] = u[0];
+
+    u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+    u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+
+    addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+    x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = x;
+    y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    u[10] = y;
 
     // stage 5
-    addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    x = _mm_mullo_epi32(u[5], cospi32);
+    y = _mm_mullo_epi32(u[6], cospi32);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 
     // stage 6
-    v[0] = u[0];
-    v[1] = u[1];
-    v[2] = u[2];
-    v[3] = u[3];
+    addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
 
-    v[4] = _mm_mullo_epi32(u[4], cospi16);
-    x = _mm_mullo_epi32(u[5], cospi48);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[13], cospi32);
+    u[10] = _mm_sub_epi32(y, x);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
 
-    v[5] = _mm_mullo_epi32(u[4], cospi48);
-    x = _mm_mullo_epi32(u[5], cospi16);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
+    u[13] = _mm_add_epi32(x, y);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
 
-    v[6] = _mm_mullo_epi32(u[6], cospim48);
-    x = _mm_mullo_epi32(u[7], cospi16);
-    v[6] = _mm_add_epi32(v[6], x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(u[12], cospi32);
+    u[11] = _mm_sub_epi32(y, x);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
 
-    v[7] = _mm_mullo_epi32(u[6], cospi16);
-    x = _mm_mullo_epi32(u[7], cospim48);
-    v[7] = _mm_sub_epi32(v[7], x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
+    u[12] = _mm_add_epi32(x, y);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
+    // stage 7
+    if (do_cols) {
+      addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
+      addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
+      addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
+      addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
+      addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
+      addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
+      addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
+      addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+    }
+  }
+}
 
-    v[8] = u[8];
-    v[9] = u[9];
-    v[10] = u[10];
-    v[11] = u[11];
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i v[16], x, y, temp1, temp2;
 
-    v[12] = _mm_mullo_epi32(u[12], cospi16);
-    x = _mm_mullo_epi32(u[13], cospi48);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
+  // Calculate the column 0, 1, 2, 3
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    x = _mm_mullo_epi32(in[0], cospi62);
+    v[0] = _mm_add_epi32(x, rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
 
-    v[13] = _mm_mullo_epi32(u[12], cospi48);
-    x = _mm_mullo_epi32(u[13], cospi16);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+    x = _mm_mullo_epi32(in[0], cospi2);
+    v[1] = _mm_sub_epi32(zero, x);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
 
-    v[14] = _mm_mullo_epi32(u[14], cospim48);
-    x = _mm_mullo_epi32(u[15], cospi16);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+    // stage 3
+    v[8] = v[0];
+    v[9] = v[1];
 
-    v[15] = _mm_mullo_epi32(u[14], cospi16);
-    x = _mm_mullo_epi32(u[15], cospim48);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+    // stage 4
+    temp1 = _mm_mullo_epi32(v[8], cospi8);
+    x = _mm_mullo_epi32(v[9], cospi56);
+    temp1 = _mm_add_epi32(temp1, x);
+    temp1 = _mm_add_epi32(temp1, rnding);
+    temp1 = _mm_srai_epi32(temp1, bit);
+
+    temp2 = _mm_mullo_epi32(v[8], cospi56);
+    x = _mm_mullo_epi32(v[9], cospi8);
+    temp2 = _mm_sub_epi32(temp2, x);
+    temp2 = _mm_add_epi32(temp2, rnding);
+    temp2 = _mm_srai_epi32(temp2, bit);
+    v[8] = temp1;
+    v[9] = temp2;
+
+    // stage 5
+    v[4] = v[0];
+    v[5] = v[1];
+    v[12] = v[8];
+    v[13] = v[9];
+
+    // stage 6
+    temp1 = _mm_mullo_epi32(v[4], cospi16);
+    x = _mm_mullo_epi32(v[5], cospi48);
+    temp1 = _mm_add_epi32(temp1, x);
+    temp1 = _mm_add_epi32(temp1, rnding);
+    temp1 = _mm_srai_epi32(temp1, bit);
+
+    temp2 = _mm_mullo_epi32(v[4], cospi48);
+    x = _mm_mullo_epi32(v[5], cospi16);
+    temp2 = _mm_sub_epi32(temp2, x);
+    temp2 = _mm_add_epi32(temp2, rnding);
+    temp2 = _mm_srai_epi32(temp2, bit);
+    v[4] = temp1;
+    v[5] = temp2;
+
+    temp1 = _mm_mullo_epi32(v[12], cospi16);
+    x = _mm_mullo_epi32(v[13], cospi48);
+    temp1 = _mm_add_epi32(temp1, x);
+    temp1 = _mm_add_epi32(temp1, rnding);
+    temp1 = _mm_srai_epi32(temp1, bit);
+
+    temp2 = _mm_mullo_epi32(v[12], cospi48);
+    x = _mm_mullo_epi32(v[13], cospi16);
+    temp2 = _mm_sub_epi32(temp2, x);
+    temp2 = _mm_add_epi32(temp2, rnding);
+    temp2 = _mm_srai_epi32(temp2, bit);
+    v[12] = temp1;
+    v[13] = temp2;
 
     // stage 7
-    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+    v[2] = v[0];
+    v[3] = v[1];
+    v[6] = v[4];
+    v[7] = v[5];
+    v[10] = v[8];
+    v[11] = v[9];
+    v[14] = v[12];
+    v[15] = v[13];
 
     // stage 8
-    v[0] = u[0];
-    v[1] = u[1];
-
-    y = _mm_mullo_epi32(u[2], cospi32);
-    x = _mm_mullo_epi32(u[3], cospi32);
+    y = _mm_mullo_epi32(v[2], cospi32);
+    x = _mm_mullo_epi32(v[3], cospi32);
     v[2] = _mm_add_epi32(y, x);
     v[2] = _mm_add_epi32(v[2], rnding);
     v[2] = _mm_srai_epi32(v[2], bit);
@@ -1534,11 +2000,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[3] = _mm_add_epi32(v[3], rnding);
     v[3] = _mm_srai_epi32(v[3], bit);
 
-    v[4] = u[4];
-    v[5] = u[5];
-
-    y = _mm_mullo_epi32(u[6], cospi32);
-    x = _mm_mullo_epi32(u[7], cospi32);
+    y = _mm_mullo_epi32(v[6], cospi32);
+    x = _mm_mullo_epi32(v[7], cospi32);
     v[6] = _mm_add_epi32(y, x);
     v[6] = _mm_add_epi32(v[6], rnding);
     v[6] = _mm_srai_epi32(v[6], bit);
@@ -1547,11 +2010,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[7] = _mm_add_epi32(v[7], rnding);
     v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[8] = u[8];
-    v[9] = u[9];
-
-    y = _mm_mullo_epi32(u[10], cospi32);
-    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(v[10], cospi32);
+    x = _mm_mullo_epi32(v[11], cospi32);
     v[10] = _mm_add_epi32(y, x);
     v[10] = _mm_add_epi32(v[10], rnding);
     v[10] = _mm_srai_epi32(v[10], bit);
@@ -1560,11 +2020,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[11] = _mm_add_epi32(v[11], rnding);
     v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[12] = u[12];
-    v[13] = u[13];
-
-    y = _mm_mullo_epi32(u[14], cospi32);
-    x = _mm_mullo_epi32(u[15], cospi32);
+    y = _mm_mullo_epi32(v[14], cospi32);
+    x = _mm_mullo_epi32(v[15], cospi32);
     v[14] = _mm_add_epi32(y, x);
     v[14] = _mm_add_epi32(v[14], rnding);
     v[14] = _mm_srai_epi32(v[14], bit);
@@ -1575,439 +2032,1904 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
 
     // stage 9
     if (do_cols) {
-      out[0 * col_num + col] = v[0];
-      out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
-      out[2 * col_num + col] = v[12];
-      out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
-      out[4 * col_num + col] = v[6];
-      out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
-      out[6 * col_num + col] = v[10];
-      out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
-      out[8 * col_num + col] = v[3];
-      out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
-      out[10 * col_num + col] = v[15];
-      out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
-      out[12 * col_num + col] = v[5];
-      out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
-      out[14 * col_num + col] = v[9];
-      out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+      out[0] = v[0];
+      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+      out[2] = v[12];
+      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+      out[4] = v[6];
+      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+      out[6] = v[10];
+      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+      out[8] = v[3];
+      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+      out[10] = v[15];
+      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+      out[12] = v[5];
+      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+      out[14] = v[9];
+      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
     } else {
-      neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col,
-                       out + 1 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col,
-                       out + 3 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col,
-                       out + 5 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col,
-                       out + 7 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col,
-                       out + 9 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col,
-                       out + 11 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col,
-                       out + 13 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
-      neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col,
-                       out + 15 * col_num + col, &clamp_lo, &clamp_hi,
-                       out_shift);
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
     }
   }
 }
 
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
-  __m128i in[64], out[64];
-  const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
-  const int txw_idx = get_txw_idx(TX_16X16);
-  const int txh_idx = get_txh_idx(TX_16X16);
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], x, y;
 
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case DCT_ADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case ADST_DCT:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case ADST_ADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
-      break;
-    case FLIPADST_DCT:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_16x16(coeff, in);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        -shift[0]);
-      transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
-      break;
-    default: assert(0);
-  }
-}
+  // Calculate the column 0, 1, 2, 3
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    __m128i zero = _mm_setzero_si128();
+    x = _mm_mullo_epi32(in[0], cospi62);
+    u[0] = _mm_add_epi32(x, rnding);
+    u[0] = _mm_srai_epi32(u[0], bit);
+
+    x = _mm_mullo_epi32(in[0], cospi2);
+    u[1] = _mm_sub_epi32(zero, x);
+    u[1] = _mm_add_epi32(u[1], rnding);
+    u[1] = _mm_srai_epi32(u[1], bit);
+
+    x = _mm_mullo_epi32(in[2], cospi54);
+    u[2] = _mm_add_epi32(x, rnding);
+    u[2] = _mm_srai_epi32(u[2], bit);
+
+    x = _mm_mullo_epi32(in[2], cospi10);
+    u[3] = _mm_sub_epi32(zero, x);
+    u[3] = _mm_add_epi32(u[3], rnding);
+    u[3] = _mm_srai_epi32(u[3], bit);
+
+    x = _mm_mullo_epi32(in[4], cospi46);
+    u[4] = _mm_add_epi32(x, rnding);
+    u[4] = _mm_srai_epi32(u[4], bit);
+
+    x = _mm_mullo_epi32(in[4], cospi18);
+    u[5] = _mm_sub_epi32(zero, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
 
-static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) {
-  int i, j;
+    x = _mm_mullo_epi32(in[6], cospi38);
+    u[6] = _mm_add_epi32(x, rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
 
-  __m128i zero = _mm_setzero_si128();
+    x = _mm_mullo_epi32(in[6], cospi26);
+    u[7] = _mm_sub_epi32(zero, x);
+    u[7] = _mm_add_epi32(u[7], rnding);
+    u[7] = _mm_srai_epi32(u[7], bit);
 
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 8; ++j) {
-      in[16 * i + j] =
-          _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j));
-      in[16 * i + j + 8] = zero;
-    }
-  }
+    u[8] = _mm_mullo_epi32(in[7], cospi34);
+    u[8] = _mm_add_epi32(u[8], rnding);
+    u[8] = _mm_srai_epi32(u[8], bit);
 
-  for (i = 0; i < 512; ++i) in[512 + i] = zero;
-}
+    u[9] = _mm_mullo_epi32(in[7], cospi30);
+    u[9] = _mm_add_epi32(u[9], rnding);
+    u[9] = _mm_srai_epi32(u[9], bit);
 
-static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) {
-  int i, j;
-  for (i = 0; i < (do_cols ? 16 : 8); ++i) {
-    for (j = 0; j < 8; ++j) {
-      TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j],
-                    in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j],
-                    out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i],
-                    out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]);
-    }
-  }
-}
+    u[10] = _mm_mullo_epi32(in[5], cospi42);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
 
-static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16,
-                                          int col) {
-  int i;
-  for (i = 0; i < 16 * 16 / 4; i += 4) {
-    in16x16[i] = in[col];
-    in16x16[i + 1] = in[col + 1];
-    in16x16[i + 2] = in[col + 2];
-    in16x16[i + 3] = in[col + 3];
-    col += 8;
-  }
-}
+    u[11] = _mm_mullo_epi32(in[5], cospi22);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
 
-static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m128i in16x16[16 * 16 / 4];
-  uint16_t *leftUp = &output[0];
-  uint16_t *rightUp = &output[16];
-  uint16_t *leftDown = &output[16 * stride];
-  uint16_t *rightDown = &output[16 * stride + 16];
+    u[12] = _mm_mullo_epi32(in[3], cospi50);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
 
-  if (fliplr) {
-    swap_addr(&leftUp, &rightUp);
-    swap_addr(&leftDown, &rightDown);
-  }
+    u[13] = _mm_mullo_epi32(in[3], cospi14);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
 
-  if (flipud) {
-    swap_addr(&leftUp, &leftDown);
-    swap_addr(&rightUp, &rightDown);
-  }
+    u[14] = _mm_mullo_epi32(in[1], cospi58);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
 
-  // Left-up quarter
-  assign_16x16_input_from_32x32(in, in16x16, 0);
-  write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd);
+    u[15] = _mm_mullo_epi32(in[1], cospi6);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
 
-  // Right-up quarter
-  assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4);
-  write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd);
+    // stage 3
+    addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 
-  // Left-down quarter
-  assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4);
-  write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd);
+    // stage 4
+    y = _mm_mullo_epi32(u[8], cospi56);
+    x = _mm_mullo_epi32(u[9], cospi56);
+    u[8] = _mm_mullo_epi32(u[8], cospi8);
+    u[8] = _mm_add_epi32(u[8], x);
+    u[8] = _mm_add_epi32(u[8], rnding);
+    u[8] = _mm_srai_epi32(u[8], bit);
 
-  // Right-down quarter
-  assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4);
-  write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd);
-}
+    x = _mm_mullo_epi32(u[9], cospi8);
+    u[9] = _mm_sub_epi32(y, x);
+    u[9] = _mm_add_epi32(u[9], rnding);
+    u[9] = _mm_srai_epi32(u[9], bit);
 
-static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32,
-                                          int col) {
-  int i;
-  for (i = 0; i < 32 * 32 / 4; i += 8) {
-    in32x32[i] = in[col];
-    in32x32[i + 1] = in[col + 1];
-    in32x32[i + 2] = in[col + 2];
-    in32x32[i + 3] = in[col + 3];
-    in32x32[i + 4] = in[col + 4];
-    in32x32[i + 5] = in[col + 5];
-    in32x32[i + 6] = in[col + 6];
-    in32x32[i + 7] = in[col + 7];
-    col += 16;
-  }
-}
+    x = _mm_mullo_epi32(u[11], cospi24);
+    y = _mm_mullo_epi32(u[10], cospi24);
+    u[10] = _mm_mullo_epi32(u[10], cospi40);
+    u[10] = _mm_add_epi32(u[10], x);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
 
-static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride,
-                               int fliplr, int flipud, int shift, int bd) {
-  __m128i in32x32[32 * 32 / 4];
-  uint16_t *leftUp = &output[0];
-  uint16_t *rightUp = &output[32];
-  uint16_t *leftDown = &output[32 * stride];
-  uint16_t *rightDown = &output[32 * stride + 32];
+    x = _mm_mullo_epi32(u[11], cospi40);
+    u[11] = _mm_sub_epi32(y, x);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
 
-  if (fliplr) {
-    swap_addr(&leftUp, &rightUp);
-    swap_addr(&leftDown, &rightDown);
-  }
+    x = _mm_mullo_epi32(u[13], cospi8);
+    y = _mm_mullo_epi32(u[12], cospi8);
+    u[12] = _mm_mullo_epi32(u[12], cospim56);
+    u[12] = _mm_add_epi32(u[12], x);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
 
-  if (flipud) {
-    swap_addr(&leftUp, &leftDown);
-    swap_addr(&rightUp, &rightDown);
-  }
+    x = _mm_mullo_epi32(u[13], cospim56);
+    u[13] = _mm_sub_epi32(y, x);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
+
+    x = _mm_mullo_epi32(u[15], cospi40);
+    y = _mm_mullo_epi32(u[14], cospi40);
+    u[14] = _mm_mullo_epi32(u[14], cospim24);
+    u[14] = _mm_add_epi32(u[14], x);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
+
+    x = _mm_mullo_epi32(u[15], cospim24);
+    u[15] = _mm_sub_epi32(y, x);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
+
+    // stage 5
+    addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    x = _mm_mullo_epi32(u[5], cospi48);
+    y = _mm_mullo_epi32(u[4], cospi48);
+    u[4] = _mm_mullo_epi32(u[4], cospi16);
+    u[4] = _mm_add_epi32(u[4], x);
+    u[4] = _mm_add_epi32(u[4], rnding);
+    u[4] = _mm_srai_epi32(u[4], bit);
+
+    x = _mm_mullo_epi32(u[5], cospi16);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    x = _mm_mullo_epi32(u[7], cospi16);
+    y = _mm_mullo_epi32(u[6], cospi16);
+    u[6] = _mm_mullo_epi32(u[6], cospim48);
+    u[6] = _mm_add_epi32(u[6], x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    x = _mm_mullo_epi32(u[7], cospim48);
+    u[7] = _mm_sub_epi32(y, x);
+    u[7] = _mm_add_epi32(u[7], rnding);
+    u[7] = _mm_srai_epi32(u[7], bit);
+
+    x = _mm_mullo_epi32(u[13], cospi48);
+    y = _mm_mullo_epi32(u[12], cospi48);
+    u[12] = _mm_mullo_epi32(u[12], cospi16);
+    u[12] = _mm_add_epi32(u[12], x);
+    u[12] = _mm_add_epi32(u[12], rnding);
+    u[12] = _mm_srai_epi32(u[12], bit);
+
+    x = _mm_mullo_epi32(u[13], cospi16);
+    u[13] = _mm_sub_epi32(y, x);
+    u[13] = _mm_add_epi32(u[13], rnding);
+    u[13] = _mm_srai_epi32(u[13], bit);
+
+    x = _mm_mullo_epi32(u[15], cospi16);
+    y = _mm_mullo_epi32(u[14], cospi16);
+    u[14] = _mm_mullo_epi32(u[14], cospim48);
+    u[14] = _mm_add_epi32(u[14], x);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
+
+    x = _mm_mullo_epi32(u[15], cospim48);
+    u[15] = _mm_sub_epi32(y, x);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
+
+    // stage 7
+    addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 8
+    y = _mm_mullo_epi32(u[2], cospi32);
+    x = _mm_mullo_epi32(u[3], cospi32);
+    u[2] = _mm_add_epi32(y, x);
+    u[2] = _mm_add_epi32(u[2], rnding);
+    u[2] = _mm_srai_epi32(u[2], bit);
+
+    u[3] = _mm_sub_epi32(y, x);
+    u[3] = _mm_add_epi32(u[3], rnding);
+    u[3] = _mm_srai_epi32(u[3], bit);
+    y = _mm_mullo_epi32(u[6], cospi32);
+    x = _mm_mullo_epi32(u[7], cospi32);
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
 
-  // Left-up quarter
-  assign_32x32_input_from_64x64(in, in32x32, 0);
-  write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd);
+    u[7] = _mm_sub_epi32(y, x);
+    u[7] = _mm_add_epi32(u[7], rnding);
+    u[7] = _mm_srai_epi32(u[7], bit);
 
-  // Right-up quarter
-  assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4);
-  write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd);
+    y = _mm_mullo_epi32(u[10], cospi32);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    u[10] = _mm_add_epi32(y, x);
+    u[10] = _mm_add_epi32(u[10], rnding);
+    u[10] = _mm_srai_epi32(u[10], bit);
+
+    u[11] = _mm_sub_epi32(y, x);
+    u[11] = _mm_add_epi32(u[11], rnding);
+    u[11] = _mm_srai_epi32(u[11], bit);
+
+    y = _mm_mullo_epi32(u[14], cospi32);
+    x = _mm_mullo_epi32(u[15], cospi32);
+    u[14] = _mm_add_epi32(y, x);
+    u[14] = _mm_add_epi32(u[14], rnding);
+    u[14] = _mm_srai_epi32(u[14], bit);
 
-  // Left-down quarter
-  assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4);
-  write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd);
+    u[15] = _mm_sub_epi32(y, x);
+    u[15] = _mm_add_epi32(u[15], rnding);
+    u[15] = _mm_srai_epi32(u[15], bit);
 
-  // Right-down quarter
-  assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4);
-  write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd);
+    // stage 9
+    if (do_cols) {
+      out[0] = u[0];
+      out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
+      out[2] = u[12];
+      out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
+      out[4] = u[6];
+      out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
+      out[6] = u[10];
+      out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
+      out[8] = u[3];
+      out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
+      out[10] = u[15];
+      out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
+      out[12] = u[5];
+      out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
+      out[14] = u[9];
+      out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+    }
+  }
 }
 
-static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
-  int i, j;
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
-  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
-  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-  int col;
-
-  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
-  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
-  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
-  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
-  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
-  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
-  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
-  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
-  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
-  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
-  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
-  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
-  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
-  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
-  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
-  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
-  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
-  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
-  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
-  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
-  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
-  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);  // "m" = negated
   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
-  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
-  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
-  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
-  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
-  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
-  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
-  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
-  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
-  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
-  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
-  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
-  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
-  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
-  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
-
-  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
-  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
-  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
-  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
-  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
-  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
-  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
-  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
-  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
-  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
-  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
-  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
-  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
-  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
-  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
-  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
-  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
-  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
-  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
-  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
-  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
-  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
-  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
-  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
-
-  for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) {
-    __m128i u[64], v[64];
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // added before >> bit
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // clamp bit width
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], v[16], x, y;  // stages alternate u <-> v; x, y are scratch
 
+  {
+    // stage 0: nothing to do; intrinsics below work per 32-bit lane
     // stage 1
-    u[32] = in[1 * 16 + col];
-    u[34] = in[17 * 16 + col];
-    u[36] = in[9 * 16 + col];
-    u[38] = in[25 * 16 + col];
-    u[40] = in[5 * 16 + col];
-    u[42] = in[21 * 16 + col];
-    u[44] = in[13 * 16 + col];
-    u[46] = in[29 * 16 + col];
-    u[48] = in[3 * 16 + col];
-    u[50] = in[19 * 16 + col];
-    u[52] = in[11 * 16 + col];
-    u[54] = in[27 * 16 + col];
-    u[56] = in[7 * 16 + col];
-    u[58] = in[23 * 16 + col];
-    u[60] = in[15 * 16 + col];
-    u[62] = in[31 * 16 + col];
-
-    v[16] = in[2 * 16 + col];
-    v[18] = in[18 * 16 + col];
-    v[20] = in[10 * 16 + col];
-    v[22] = in[26 * 16 + col];
-    v[24] = in[6 * 16 + col];
-    v[26] = in[22 * 16 + col];
-    v[28] = in[14 * 16 + col];
-    v[30] = in[30 * 16 + col];
-
-    u[8] = in[4 * 16 + col];
-    u[10] = in[20 * 16 + col];
-    u[12] = in[12 * 16 + col];
-    u[14] = in[28 * 16 + col];
-
-    v[4] = in[8 * 16 + col];
-    v[6] = in[24 * 16 + col];
-
-    u[0] = in[0 * 16 + col];
-    u[2] = in[16 * 16 + col];
+    u[0] = in[0];  // in[] indices follow 4-bit bit-reversed order
+    u[1] = in[8];
+    u[2] = in[4];
+    u[3] = in[12];
+    u[4] = in[2];
+    u[5] = in[10];
+    u[6] = in[6];
+    u[7] = in[14];
+    u[8] = in[1];
+    u[9] = in[9];
+    u[10] = in[5];
+    u[11] = in[13];
+    u[12] = in[3];
+    u[13] = in[11];
+    u[14] = in[7];
+    u[15] = in[15];
 
     // stage 2
-    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
-    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
-    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
-    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
-    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
-    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
-    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
-    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
-    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
-    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
-    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
-    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
-    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
-    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
-    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
-    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
-    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
-    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
-    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
-    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
-    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
-    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
-    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
-    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
-    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
-    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
-    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
-    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
-    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
-    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
-    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
-    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+    v[0] = u[0];  // elements 0-7 are copied through unchanged
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
 
-    // stage 3
-    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
-    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
-    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+    // stage 3: 0-3 copied, half_btf on 4-7, addsub on pairs of 8-15
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4: 0/1 -> (u0 +/- u1) * cospi32, rounded and shifted by bit
+    x = _mm_mullo_epi32(u[0], cospi32);
+    y = _mm_mullo_epi32(u[1], cospi32);
+    v[0] = _mm_add_epi32(x, y);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_sub_epi32(x, y);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    v[15] = u[15];
+
+    // stage 5: 5/6 -> (v6 -/+ v5) * cospi32 with rounding; rest addsub/copy
+    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+    u[4] = v[4];
+
+    x = _mm_mullo_epi32(v[5], cospi32);
+    y = _mm_mullo_epi32(v[6], cospi32);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6: addsub of i with 7-i; 10/13, 11/12 -> diff/sum * cospi32
+    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[13], cospi32);
+    v[10] = _mm_sub_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[13] = _mm_add_epi32(x, y);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(u[12], cospi32);
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_add_epi32(x, y);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7: v[i] with v[15-i]; only !do_cols passes clamps and out_shift
+    if (do_cols) {
+      addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
+      addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
+      addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
+      addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
+      addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
+      addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
+      addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
+      addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+      addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
+                          &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[16], v[16], x, y;
+
+  // Calculate the column 0, 1, 2, 3
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    v[0] = _mm_mullo_epi32(in[15], cospi2);
+    x = _mm_mullo_epi32(in[0], cospi62);
+    v[0] = _mm_add_epi32(v[0], x);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_mullo_epi32(in[15], cospi62);
+    x = _mm_mullo_epi32(in[0], cospi2);
+    v[1] = _mm_sub_epi32(v[1], x);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = _mm_mullo_epi32(in[13], cospi10);
+    x = _mm_mullo_epi32(in[2], cospi54);
+    v[2] = _mm_add_epi32(v[2], x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_mullo_epi32(in[13], cospi54);
+    x = _mm_mullo_epi32(in[2], cospi10);
+    v[3] = _mm_sub_epi32(v[3], x);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = _mm_mullo_epi32(in[11], cospi18);
+    x = _mm_mullo_epi32(in[4], cospi46);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(in[11], cospi46);
+    x = _mm_mullo_epi32(in[4], cospi18);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(in[9], cospi26);
+    x = _mm_mullo_epi32(in[6], cospi38);
+    v[6] = _mm_add_epi32(v[6], x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_mullo_epi32(in[9], cospi38);
+    x = _mm_mullo_epi32(in[6], cospi26);
+    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = _mm_mullo_epi32(in[7], cospi34);
+    x = _mm_mullo_epi32(in[8], cospi30);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(in[7], cospi30);
+    x = _mm_mullo_epi32(in[8], cospi34);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(in[5], cospi42);
+    x = _mm_mullo_epi32(in[10], cospi22);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_mullo_epi32(in[5], cospi22);
+    x = _mm_mullo_epi32(in[10], cospi42);
+    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(in[3], cospi50);
+    x = _mm_mullo_epi32(in[12], cospi14);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(in[3], cospi14);
+    x = _mm_mullo_epi32(in[12], cospi50);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(in[1], cospi58);
+    x = _mm_mullo_epi32(in[14], cospi6);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(in[1], cospi6);
+    x = _mm_mullo_epi32(in[14], cospi58);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 3
+    addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm_mullo_epi32(u[8], cospi8);
+    x = _mm_mullo_epi32(u[9], cospi56);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(u[8], cospi56);
+    x = _mm_mullo_epi32(u[9], cospi8);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(u[10], cospi40);
+    x = _mm_mullo_epi32(u[11], cospi24);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_mullo_epi32(u[10], cospi24);
+    x = _mm_mullo_epi32(u[11], cospi40);
+    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(u[12], cospim56);
+    x = _mm_mullo_epi32(u[13], cospi8);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi8);
+    x = _mm_mullo_epi32(u[13], cospim56);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim24);
+    x = _mm_mullo_epi32(u[15], cospi40);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi40);
+    x = _mm_mullo_epi32(u[15], cospim24);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 5
+    addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+
+    v[4] = _mm_mullo_epi32(u[4], cospi16);
+    x = _mm_mullo_epi32(u[5], cospi48);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(u[4], cospi48);
+    x = _mm_mullo_epi32(u[5], cospi16);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(u[6], cospim48);
+    x = _mm_mullo_epi32(u[7], cospi16);
+    v[6] = _mm_add_epi32(v[6], x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_mullo_epi32(u[6], cospi16);
+    x = _mm_mullo_epi32(u[7], cospim48);
+    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+
+    v[12] = _mm_mullo_epi32(u[12], cospi16);
+    x = _mm_mullo_epi32(u[13], cospi48);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi48);
+    x = _mm_mullo_epi32(u[13], cospi16);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim48);
+    x = _mm_mullo_epi32(u[15], cospi16);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi16);
+    x = _mm_mullo_epi32(u[15], cospim48);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 7
+    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 8
+    v[0] = u[0];
+    v[1] = u[1];
+
+    y = _mm_mullo_epi32(u[2], cospi32);
+    x = _mm_mullo_epi32(u[3], cospi32);
+    v[2] = _mm_add_epi32(y, x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_sub_epi32(y, x);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    y = _mm_mullo_epi32(u[6], cospi32);
+    x = _mm_mullo_epi32(u[7], cospi32);
+    v[6] = _mm_add_epi32(y, x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_sub_epi32(y, x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    y = _mm_mullo_epi32(u[10], cospi32);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    v[10] = _mm_add_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    y = _mm_mullo_epi32(u[14], cospi32);
+    x = _mm_mullo_epi32(u[15], cospi32);
+    v[14] = _mm_add_epi32(y, x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_sub_epi32(y, x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 9
+    if (do_cols) {
+      out[0] = v[0];
+      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+      out[2] = v[12];
+      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+      out[4] = v[6];
+      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+      out[6] = v[10];
+      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+      out[8] = v[3];
+      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+      out[10] = v[15];
+      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+      out[12] = v[5];
+      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+      out[14] = v[9];
+      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+    }
+  }
+}
+// idct64 stage 8, in place on u[10..13], u[16..31], u[36..43], u[52..59].
+static INLINE void idct64_stage8_sse4_1(
+    __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+    const __m128i *rnding, int bit) {
+  int i;
+  __m128i temp1, temp2, temp3, temp4;  // park results until inputs are read
+  temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+  u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+  u[10] = temp1;
+  temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+  u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+  u[11] = temp2;
+  // i ^ 7 pairs 16..19 with 23..20; i ^ 15 and i ^ 8 pair 31..28 with 24..27.
+  for (i = 16; i < 20; ++i) {
+    addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+    addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
+                  clamp_hi);
+  }
+  // u[36..39] against u[59..56]: all eight calls read the pre-update values.
+  temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+  temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+  temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+  temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+  u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+  u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+  u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+  u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+  u[36] = temp1;
+  u[37] = temp2;
+  u[38] = temp3;
+  u[39] = temp4;
+  // u[40..43] against u[55..52], same deferred-store pattern.
+  temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+  temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+  temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+  temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+  u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+  u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+  u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+  u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+  u[40] = temp1;
+  u[41] = temp2;
+  u[42] = temp3;
+  u[43] = temp4;
+}
+// idct64 stage 9, in place on u[0..15], u[20..27] and u[32..63].
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rnding, int bit) {
+  int i;
+  __m128i temp1, temp2, temp3, temp4;  // park results until inputs are read
+  for (i = 0; i < 8; ++i) {  // pairs (0,15), (1,14), ..., (7,8)
+    addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+  }
+  // u[20..23] against u[27..24]; the temps defer the u[20..23] stores.
+  temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+  temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+  temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+  temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+  u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+  u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+  u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+  u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+  u[20] = temp1;
+  u[21] = temp2;
+  u[22] = temp3;
+  u[23] = temp4;
+  for (i = 32; i < 40; i++) {  // i ^ 15 walks 47 down to 40
+    addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+  }
+  // Operands swap roles here: u[i ^ 15] (63 down to 56) is passed first.
+  for (i = 48; i < 56; i++) {
+    addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+  }
+}
+// idct64 stage 10, in place on u[0..31] and u[40..55].
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+                                         const __m128i *cospi32,
+                                         const __m128i *clamp_lo,
+                                         const __m128i *clamp_hi,
+                                         const __m128i *rnding, int bit) {
+  __m128i temp1, temp2, temp3, temp4;  // park results until inputs are read
+  for (int i = 0; i < 16; i++) {  // pairs (0,31), (1,30), ..., (15,16)
+    addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+  }
+  // u[40..43] against u[55..52]; the temps defer the u[40..43] stores.
+  temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+  temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+  temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+  temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+  u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+  u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+  u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+  u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+  u[40] = temp1;
+  u[41] = temp2;
+  u[42] = temp3;
+  u[43] = temp4;
+  // u[44..47] against u[51..48], same weights and same deferred stores.
+  temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+  temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+  temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+  temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+  u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+  u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+  u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+  u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+  u[44] = temp1;
+  u[45] = temp2;
+  u[46] = temp3;
+  u[47] = temp4;
+}
+// idct64 stage 11: combines u[i] with u[63 - i] into out[i] and out[63 - i].
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+                                         int bd, int out_shift,
+                                         const int log_range) {
+  if (do_cols) {  // bd, out_shift and log_range are unused on this path
+    for (int i = 0; i < 32; i++) {
+      addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
+    }
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);  // 16 or more if AOMMAX=max
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    // NOTE(review): 2nd AOMMIN term has no "- 1", unlike the 1st; confirm.
+    for (int i = 0; i < 32; i++) {
+      addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
+                          &clamp_lo_out, &clamp_hi_out, out_shift);
+    }
+  }
+}
+// Reads only in[0]; stores one shared result vector in all 64 out[] slots.
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // half of 1 << bit
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+  {
+    __m128i x;
+
+    // stages 1-6
+    // in[0] is the only input read here, and the call below is the only
+    // arithmetic applied to it before the final stage.
+    // half_btf_0_sse4_1 is defined elsewhere; presumably it returns
+    // (cospi32 * in[0] + rnding) >> bit in every 32-bit lane.
+    // TODO confirm against the helper's definition.
+    x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+    // stages 8-10 have no code in this function.
+    // stage 11
+    // do_cols != 0: clamp x to [clamp_lo, clamp_hi].
+    // do_cols == 0: add a rounding offset, shift by out_shift, then clamp.
+    if (do_cols) {
+      x = _mm_max_epi32(x, clamp_lo);
+      x = _mm_min_epi32(x, clamp_hi);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);  // rounding bias
+      x = _mm_add_epi32(x, offset);
+      x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));  // arithmetic shift
+
+      x = _mm_max_epi32(x, clamp_lo_out);
+      x = _mm_min_epi32(x, clamp_hi_out);
+    }
+
+    out[0] = x;  // from here on every out[] slot receives the same vector
+    out[63] = x;
+    out[1] = x;
+    out[62] = x;
+    out[2] = x;
+    out[61] = x;
+    out[3] = x;
+    out[60] = x;
+    out[4] = x;
+    out[59] = x;
+    out[5] = x;
+    out[58] = x;
+    out[6] = x;
+    out[57] = x;
+    out[7] = x;
+    out[56] = x;
+    out[8] = x;
+    out[55] = x;
+    out[9] = x;
+    out[54] = x;
+    out[10] = x;
+    out[53] = x;
+    out[11] = x;
+    out[52] = x;
+    out[12] = x;
+    out[51] = x;
+    out[13] = x;
+    out[50] = x;
+    out[14] = x;
+    out[49] = x;
+    out[15] = x;
+    out[48] = x;
+    out[16] = x;
+    out[47] = x;
+    out[17] = x;
+    out[46] = x;
+    out[18] = x;
+    out[45] = x;
+    out[19] = x;
+    out[44] = x;
+    out[20] = x;
+    out[43] = x;
+    out[21] = x;
+    out[42] = x;
+    out[22] = x;
+    out[41] = x;
+    out[23] = x;
+    out[40] = x;
+    out[24] = x;
+    out[39] = x;
+    out[25] = x;
+    out[38] = x;
+    out[26] = x;
+    out[37] = x;
+    out[27] = x;
+    out[36] = x;
+    out[28] = x;
+    out[35] = x;
+    out[29] = x;
+    out[34] = x;
+    out[30] = x;
+    out[33] = x;
+    out[31] = x;
+    out[32] = x;
+  }
+}
+// Reads only in[0..7]; runs stages 1-11 on a local u[64] and fills out[0..63].
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // half of 1 << bit
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+
+  {
+    __m128i u[64];
+
+    // stage 1: place the eight inputs at u[0], u[8], u[16], ..., u[56]
+    u[0] = in[0];
+    u[8] = in[4];
+    u[16] = in[2];
+    u[24] = in[6];
+    u[32] = in[1];
+    u[40] = in[5];
+    u[48] = in[3];
+    u[56] = in[7];
+
+    // stage 2: u[32], u[40], u[48], u[56] each feed two half_btf_0 calls
+    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+
+    // stage 3: same for u[16], u[24]; then copy stage 2 results to neighbours
+    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+    u[33] = u[32];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[62] = u[63];
+
+    // stage 4
+    __m128i temp1, temp2;  // park one output while its inputs are still read
+    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+    u[17] = u[16];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[30] = u[31];
+
+    temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+    u[33] = temp1;
+
+    temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    u[57] = temp2;
+
+    temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    u[41] = temp1;
+
+    temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    u[46] = temp2;
+
+    // stage 5
+    u[9] = u[8];
+    u[14] = u[15];
+
+    temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+    u[17] = temp1;
+
+    temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+    u[22] = temp2;
+
+    u[35] = u[32];
+    u[34] = u[33];
+    u[36] = u[39];
+    u[37] = u[38];
+    u[43] = u[40];
+    u[42] = u[41];
+    u[44] = u[47];
+    u[45] = u[46];
+    u[51] = u[48];
+    u[50] = u[49];
+    u[52] = u[55];
+    u[53] = u[54];
+    u[59] = u[56];
+    u[58] = u[57];
+    u[60] = u[63];
+    u[61] = u[62];
+
+    // stage 6
+    // u[0] and u[1] receive the same product, so it is computed only once.
+    u[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[1] = u[0];
+
+    temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = temp2;
+    u[19] = u[16];
+    u[18] = u[17];
+    u[20] = u[23];
+    u[21] = u[22];
+    u[27] = u[24];
+    u[26] = u[25];
+    u[28] = u[31];
+    u[29] = u[30];
+
+    temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+    u[34] = temp1;
+    temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    u[35] = temp2;
+    temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    u[36] = temp1;
+    temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    u[37] = temp2;
+    temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    u[42] = temp1;
+    temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    u[43] = temp2;
+    temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    u[44] = temp1;
+    temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    u[45] = temp2;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    u[11] = u[8];
+    u[10] = u[9];
+    u[12] = u[15];
+    u[13] = u[14];
+
+    temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+    u[18] = temp1;
+    temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+    u[19] = temp2;
+    temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+    u[20] = temp1;
+    temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+    u[21] = temp2;
+    for (i = 32; i < 64; i += 16) {  // i = 32 and i = 48
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    u[7] = u[0];
+    u[6] = u[1];
+    u[5] = u[2];
+    u[4] = u[3];
+    // u[8], u[9], u[14], u[15] keep their stage 7 values through stage 8.
+
+    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+    // stage 9
+    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                         bit);
+
+    // stage 10
+    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                          bit);
+
+    // stage 11
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+  {
+    __m128i u[64];
+    __m128i tmp1, tmp2, tmp3, tmp4;
+    // stage 1
+    u[0] = in[0];
+    u[32] = in[1];
+    u[36] = in[9];
+    u[40] = in[5];
+    u[44] = in[13];
+    u[48] = in[3];
+    u[52] = in[11];
+    u[56] = in[7];
+    u[60] = in[15];
+    u[16] = in[2];
+    u[20] = in[10];
+    u[24] = in[6];
+    u[28] = in[14];
+    u[4] = in[8];
+    u[8] = in[4];
+    u[12] = in[12];
+
+    // stage 2
+    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+    u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+    u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+    u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+    u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+    u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+    u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+
+    // stage 3
+    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+    u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+    u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+    u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+    u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+    u[33] = u[32];
+    u[34] = u[35];
+    u[37] = u[36];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[42] = u[43];
+    u[45] = u[44];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[50] = u[51];
+    u[53] = u[52];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[58] = u[59];
+    u[61] = u[60];
+    u[62] = u[63];
+
+    // stage 4
+    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+    u[17] = u[16];
+    u[18] = u[19];
+    u[21] = u[20];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[26] = u[27];
+    u[29] = u[28];
+    u[30] = u[31];
+
+    tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+    u[33] = tmp1;
+    u[34] = tmp2;
+    u[37] = tmp3;
+    u[38] = tmp4;
+
+    tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    u[41] = tmp1;
+    u[42] = tmp2;
+    u[45] = tmp3;
+    u[46] = tmp4;
+
+    // stage 5
+    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+    u[9] = u[8];
+    u[10] = u[11];
+    u[13] = u[12];
+    u[14] = u[15];
+
+    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+    u[17] = tmp1;
+    u[18] = tmp2;
+    u[21] = tmp3;
+    u[22] = tmp4;
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    // stage 6
+    tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[0] = tmp1;
+    u[5] = u[4];
+    u[6] = u[7];
+
+    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = tmp1;
+    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    u[10] = tmp2;
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+    u[34] = tmp1;
+    u[35] = tmp2;
+    u[36] = tmp3;
+    u[37] = tmp4;
+
+    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    u[42] = tmp1;
+    u[43] = tmp2;
+    u[44] = tmp3;
+    u[45] = tmp4;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+    u[5] = tmp1;
+    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+    u[18] = tmp1;
+    u[19] = tmp2;
+    u[20] = tmp3;
+    u[21] = tmp4;
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+    // stage 9
+    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                         bit);
+
+    // stage 10
+    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                          bit);
+
+    // stage 11
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+  }
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+  {
+    __m128i u[64], v[64];
+
+    // stage 1
+    u[32] = in[1];
+    u[34] = in[17];
+    u[36] = in[9];
+    u[38] = in[25];
+    u[40] = in[5];
+    u[42] = in[21];
+    u[44] = in[13];
+    u[46] = in[29];
+    u[48] = in[3];
+    u[50] = in[19];
+    u[52] = in[11];
+    u[54] = in[27];
+    u[56] = in[7];
+    u[58] = in[23];
+    u[60] = in[15];
+    u[62] = in[31];
+
+    v[16] = in[2];
+    v[18] = in[18];
+    v[20] = in[10];
+    v[22] = in[26];
+    v[24] = in[6];
+    v[26] = in[22];
+    v[28] = in[14];
+    v[30] = in[30];
+
+    u[8] = in[4];
+    u[10] = in[20];
+    u[12] = in[12];
+    u[14] = in[28];
+
+    v[4] = in[8];
+    v[6] = in[24];
+
+    u[0] = in[0];
+    u[2] = in[16];
+
+    // stage 2
+    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+    // stage 3
+    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
     u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
     u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
     u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
@@ -2039,301 +3961,1388 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
     v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
     v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
 
-    for (i = 16; i < 32; i += 4) {
-      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
-                    &clamp_hi);
+    for (i = 16; i < 32; i += 4) {
+      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+    // stage 5
+    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+    for (i = 8; i < 16; i += 4) {
+      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 16; i < 32; i += 4) {
+      u[i + 0] = v[i + 0];
+      u[i + 3] = v[i + 3];
+    }
+
+    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    // stage 6
+    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+    for (i = 8; i < 16; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 8) {
+      v[i + 0] = u[i + 0];
+      v[i + 1] = u[i + 1];
+      v[i + 6] = u[i + 6];
+      v[i + 7] = u[i + 7];
+    }
+
+    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+    // stage 7
+    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    u[4] = v[4];
+    u[7] = v[7];
+    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    for (i = 16; i < 32; i += 8) {
+      u[i + 0] = v[i + 0];
+      u[i + 1] = v[i + 1];
+      u[i + 6] = v[i + 6];
+      u[i + 7] = v[i + 7];
+    }
+
+    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[14] = u[14];
+    v[15] = u[15];
+
+    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+    for (i = 16; i < 20; ++i) {
+      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 36; ++i) {
+      v[i] = u[i];
+      v[i + 12] = u[i + 12];
+      v[i + 16] = u[i + 16];
+      v[i + 28] = u[i + 28];
+    }
+
+    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+    // stage 9
+    for (i = 0; i < 8; ++i) {
+      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 16; i < 20; ++i) {
+      u[i] = v[i];
+      u[i + 12] = v[i + 12];
+    }
+
+    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+    for (i = 32; i < 40; i++) {
+      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
     }
 
-    for (i = 32; i < 64; i += 4) {
-      v[i + 0] = u[i + 0];
-      v[i + 3] = u[i + 3];
+    for (i = 48; i < 56; i++) {
+      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
     }
 
-    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
-    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
-    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
-    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
-    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
-    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
-    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
-    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
-    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
-    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
-    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
-    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
-    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
-    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+    // stage 10
+    for (i = 0; i < 16; i++) {
+      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+    }
 
-    // stage 5
-    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
-    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
-    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
-    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+    for (i = 32; i < 40; i++) v[i] = u[i];
 
-    for (i = 8; i < 16; i += 4) {
-      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
-                    &clamp_hi);
-    }
+    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
 
-    for (i = 16; i < 32; i += 4) {
-      u[i + 0] = v[i + 0];
-      u[i + 3] = v[i + 3];
+    for (i = 56; i < 64; i++) v[i] = u[i];
+
+    // stage 11
+    if (do_cols) {
+      for (i = 0; i < 32; i++) {
+        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
+      }
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+      for (i = 0; i < 32; i++) {
+        addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
+                            &clamp_lo_out, &clamp_hi_out, out_shift);
+      }
     }
+  }
+}
 
-    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
-    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
-    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
-    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
-    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
-    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
-    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
-    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1;
 
-    for (i = 32; i < 64; i += 8) {
-      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
-                    &clamp_hi);
+  // stage 0: "low1" variant -- in[0] is the only input element read here.
+  // stage 1: load that single coefficient vector into the working register.
+  bf1 = in[0];
 
-      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
-                    &clamp_hi);
-    }
+  // stage 2 (stages 2-4 need no statements in this variant)
+  // stage 3
+  // stage 4
+  // stage 5: the only butterfly -- half_btf_0_sse4_1 applied with cospi[32].
+  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
 
-    // stage 6
-    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
-    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
-    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
-    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+  // stage 6 (stages 6-8 need no statements in this variant)
+  // stage 7
+  // stage 8
+  // stage 9: clamp if do_cols; otherwise round, shift by out_shift, clamp.
+  if (do_cols) {
+    bf1 = _mm_max_epi32(bf1, clamp_lo);
+    bf1 = _mm_min_epi32(bf1, clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+    bf1 = _mm_add_epi32(bf1, offset);
+    bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+    bf1 = _mm_max_epi32(bf1, clamp_lo_out);
+    bf1 = _mm_min_epi32(bf1, clamp_hi_out);
+  }
+  out[0] = bf1;  // the same vector is stored to all 32 outputs, out[0..31]
+  out[1] = bf1;
+  out[2] = bf1;
+  out[3] = bf1;
+  out[4] = bf1;
+  out[5] = bf1;
+  out[6] = bf1;
+  out[7] = bf1;
+  out[8] = bf1;
+  out[9] = bf1;
+  out[10] = bf1;
+  out[11] = bf1;
+  out[12] = bf1;
+  out[13] = bf1;
+  out[14] = bf1;
+  out[15] = bf1;
+  out[16] = bf1;
+  out[17] = bf1;
+  out[18] = bf1;
+  out[19] = bf1;
+  out[20] = bf1;
+  out[21] = bf1;
+  out[22] = bf1;
+  out[23] = bf1;
+  out[24] = bf1;
+  out[25] = bf1;
+  out[26] = bf1;
+  out[27] = bf1;
+  out[28] = bf1;
+  out[29] = bf1;
+  out[30] = bf1;
+  out[31] = bf1;
+}
 
-    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1[32];
 
-    for (i = 8; i < 16; i += 4) {
-      v[i + 0] = u[i + 0];
-      v[i + 3] = u[i + 3];
-    }
+  // stage 0
+  // stage 1
+  bf1[0] = in[0];
+  bf1[4] = in[4];
+  bf1[8] = in[2];
+  bf1[12] = in[6];
+  bf1[16] = in[1];
+  bf1[20] = in[5];
+  bf1[24] = in[3];
+  bf1[28] = in[7];
 
-    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+  // stage 2
+  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
 
-    for (i = 16; i < 32; i += 8) {
-      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
-                    &clamp_hi);
+  // stage 3
+  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+  bf1[17] = bf1[16];
+  bf1[18] = bf1[19];
+  bf1[21] = bf1[20];
+  bf1[22] = bf1[23];
+  bf1[25] = bf1[24];
+  bf1[26] = bf1[27];
+  bf1[29] = bf1[28];
+  bf1[30] = bf1[31];
+
+  // stage 4
+  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+  bf1[9] = bf1[8];
+  bf1[10] = bf1[11];
+  bf1[13] = bf1[12];
+  bf1[14] = bf1[15];
+
+  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
 
-      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
-                    &clamp_hi);
-      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
-                    &clamp_hi);
-    }
+  // stage 5
+  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+  bf1[1] = bf1[0];
+  bf1[5] = bf1[4];
+  bf1[6] = bf1[7];
 
-    for (i = 32; i < 64; i += 8) {
-      v[i + 0] = u[i + 0];
-      v[i + 1] = u[i + 1];
-      v[i + 6] = u[i + 6];
-      v[i + 7] = u[i + 7];
-    }
+  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
 
-    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
-    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
-    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
-    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
-    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
-    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
-    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
-    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
-    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
-    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
-    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
-    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
-    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
-    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+  // stage 6
+  bf1[3] = bf1[0];
+  bf1[2] = bf1[1];
 
-    // stage 7
-    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+  // stage 7
+  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+  // stage 8
+  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
+
+  // stage 9
+  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1[32];
+  // Reads only in[0..15]. In stages 2-5 below, each half_btf_0_sse4_1 call
+  // stands where idct32x32_sse4_1 has a two-input half_btf_sse4_1 call whose
+  // other input would come from in[16..31].
+  // stage 0 / stage 1: load in[0..15] into the even slots of bf1.
+  bf1[0] = in[0];
+  bf1[2] = in[8];
+  bf1[4] = in[4];
+  bf1[6] = in[12];
+  bf1[8] = in[2];
+  bf1[10] = in[10];
+  bf1[12] = in[6];
+  bf1[14] = in[14];
+  bf1[16] = in[1];
+  bf1[18] = in[9];
+  bf1[20] = in[5];
+  bf1[22] = in[13];
+  bf1[24] = in[3];
+  bf1[26] = in[11];
+  bf1[28] = in[7];
+  bf1[30] = in[15];
+  // stage 2: both calls of a pair read the same bf1 slot and the second call
+  // overwrites it, so the two statements of a pair must keep this order.
+  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+  bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+  bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+  bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+  bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+  bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+  bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+  bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+  bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+  // stage 3: same pair pattern on bf1[8..15], then addsub on bf1[16..31].
+  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+  bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+  bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+  bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+  bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+  addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+  // stage 4: same pair pattern on bf1[4..7], then addsub on bf1[8..15].
+  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+  bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+  bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+  addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+                       &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+  // stage 5: bf1[1] is a copy of the new bf1[0]; bf1[2]/bf1[3] form a pair.
+  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+  bf1[1] = bf1[0];
+  bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+  bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
 
-    u[4] = v[4];
-    u[7] = v[7];
-    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
-    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+  addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
 
-    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
-    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+                       &clamp_hi, &rounding, bit);
 
-    for (i = 16; i < 32; i += 8) {
-      u[i + 0] = v[i + 0];
-      u[i + 1] = v[i + 1];
-      u[i + 6] = v[i + 6];
-      u[i + 7] = v[i + 7];
-    }
+  // stage 6: addsub on bf1[0..3], then the stage-6 helper.
+  addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
 
-    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
-    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
-    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
-    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
-    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
-    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
-    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
-    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
 
-    for (i = 32; i < 64; i += 16) {
-      for (j = i; j < i + 4; j++) {
-        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
-        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
-                      &clamp_hi);
-      }
-    }
+  // stage 7
+  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
 
-    // stage 8
-    for (i = 0; i < 4; ++i) {
-      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
-    }
+  // stage 8
+  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+                       &rounding, bit);
 
-    v[8] = u[8];
-    v[9] = u[9];
-    v[14] = u[14];
-    v[15] = u[15];
+  // stage 9
+  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
 
-    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
-    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
-    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1[32], bf0[32];
 
-    for (i = 16; i < 20; ++i) {
-      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
-      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
-                    &clamp_hi);
-    }
+  // stage 0
+  // stage 1
+  bf1[0] = in[0];
+  bf1[1] = in[16];
+  bf1[2] = in[8];
+  bf1[3] = in[24];
+  bf1[4] = in[4];
+  bf1[5] = in[20];
+  bf1[6] = in[12];
+  bf1[7] = in[28];
+  bf1[8] = in[2];
+  bf1[9] = in[18];
+  bf1[10] = in[10];
+  bf1[11] = in[26];
+  bf1[12] = in[6];
+  bf1[13] = in[22];
+  bf1[14] = in[14];
+  bf1[15] = in[30];
+  bf1[16] = in[1];
+  bf1[17] = in[17];
+  bf1[18] = in[9];
+  bf1[19] = in[25];
+  bf1[20] = in[5];
+  bf1[21] = in[21];
+  bf1[22] = in[13];
+  bf1[23] = in[29];
+  bf1[24] = in[3];
+  bf1[25] = in[19];
+  bf1[26] = in[11];
+  bf1[27] = in[27];
+  bf1[28] = in[7];
+  bf1[29] = in[23];
+  bf1[30] = in[15];
+  bf1[31] = in[31];
 
-    for (i = 32; i < 36; ++i) {
-      v[i] = u[i];
-      v[i + 12] = u[i + 12];
-      v[i + 16] = u[i + 16];
-      v[i + 28] = u[i + 28];
-    }
+  // stage 2
+  bf0[0] = bf1[0];
+  bf0[1] = bf1[1];
+  bf0[2] = bf1[2];
+  bf0[3] = bf1[3];
+  bf0[4] = bf1[4];
+  bf0[5] = bf1[5];
+  bf0[6] = bf1[6];
+  bf0[7] = bf1[7];
+  bf0[8] = bf1[8];
+  bf0[9] = bf1[9];
+  bf0[10] = bf1[10];
+  bf0[11] = bf1[11];
+  bf0[12] = bf1[12];
+  bf0[13] = bf1[13];
+  bf0[14] = bf1[14];
+  bf0[15] = bf1[15];
+  bf0[16] =
+      half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+  bf0[17] =
+      half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+  bf0[18] =
+      half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+  bf0[19] =
+      half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+  bf0[20] =
+      half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+  bf0[21] =
+      half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+  bf0[22] =
+      half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+  bf0[23] =
+      half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+  bf0[24] =
+      half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+  bf0[25] =
+      half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+  bf0[26] =
+      half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+  bf0[27] =
+      half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+  bf0[28] =
+      half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+  bf0[29] =
+      half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+  bf0[30] =
+      half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+  bf0[31] =
+      half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
 
-    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
-    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
-    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
-    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
-    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
-    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
-    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
-    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
-    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
-    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
-    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
-    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
-    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
-    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+  // stage 3
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] =
+      half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+  bf1[9] =
+      half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+  bf1[10] =
+      half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+  bf1[11] =
+      half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+  bf1[12] =
+      half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+  bf1[14] =
+      half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+  bf1[15] =
+      half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+  addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
 
-    // stage 9
-    for (i = 0; i < 8; ++i) {
-      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
-    }
+  // stage 4
+  bf0[0] = bf1[0];
+  bf0[1] = bf1[1];
+  bf0[2] = bf1[2];
+  bf0[3] = bf1[3];
+  bf0[4] =
+      half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+  bf0[5] =
+      half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+  bf0[6] =
+      half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+  bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+  addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+  bf0[16] = bf1[16];
+  bf0[17] =
+      half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+  bf0[18] =
+      half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+  bf0[19] = bf1[19];
+  bf0[20] = bf1[20];
+  bf0[21] =
+      half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+  bf0[22] =
+      half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] =
+      half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+  bf0[26] =
+      half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+  bf0[27] = bf1[27];
+  bf0[28] = bf1[28];
+  bf0[29] =
+      half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+  bf0[30] =
+      half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+  bf0[31] = bf1[31];
 
-    for (i = 16; i < 20; ++i) {
-      u[i] = v[i];
-      u[i + 12] = v[i + 12];
-    }
+  // stage 5
+  bf1[0] =
+      half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+  bf1[1] =
+      half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+  bf1[2] =
+      half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+  bf1[3] =
+      half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+  addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] =
+      half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+  bf1[10] =
+      half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] =
+      half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+  bf1[14] =
+      half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+  bf1[15] = bf0[15];
+  addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
 
-    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
-    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
-    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
-    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
-    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
-    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
-    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
-    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+  // stage 6
+  addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+  bf0[4] = bf1[4];
+  bf0[5] =
+      half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+  bf0[6] =
+      half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+  bf0[7] = bf1[7];
+  addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] =
+      half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+  bf0[19] =
+      half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+  bf0[20] =
+      half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+  bf0[21] =
+      half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+  bf0[22] = bf1[22];
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] = bf1[25];
+  bf0[26] =
+      half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+  bf0[27] =
+      half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+  bf0[28] =
+      half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+  bf0[29] =
+      half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
 
-    for (i = 32; i < 40; i++) {
-      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
-    }
+  // stage 7
+  addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] =
+      half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+  bf1[11] =
+      half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+  bf1[12] =
+      half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+  bf1[13] =
+      half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+  // stage 8
+  addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] = bf1[18];
+  bf0[19] = bf1[19];
+  bf0[20] =
+      half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+  bf0[21] =
+      half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+  bf0[22] =
+      half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+  bf0[23] =
+      half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+  bf0[24] =
+      half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+  bf0[25] =
+      half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+  bf0[26] =
+      half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+  bf0[27] =
+      half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+  bf0[28] = bf1[28];
+  bf0[29] = bf1[29];
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
+
+  // stage 9
+  if (do_cols) {
+    addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
+    addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
+    addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
+    addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
+    addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
+    addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
+    addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
+    addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
+    addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
+    addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
+    addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
+    addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
+    addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
+    addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
+    addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
+    addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+    addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+    addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+  }
+}
 
-    for (i = 48; i < 56; i++) {
-      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
-    }
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // The SSE4.1 version doesn't support these transform types, so use the C
+      // version for them.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                               bd);
+      break;
+    default:
+      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                    tx_type, bd);
+      break;
+  }
+}
 
-    // stage 10
-    for (i = 0; i < 16; i++) {
-      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
-    }
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                         int stride,
+                                         const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // The SSE4.1 version doesn't support these transform types, so use the C
+      // version for them.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                txfm_param->tx_type, txfm_param->bd);
+      break;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+  }
+}
 
-    for (i = 32; i < 40; i++) v[i] = u[i];
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                         int stride,
+                                         const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // The SSE4.1 version doesn't support these transform types, so use the C
+      // version for them.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                txfm_param->tx_type, txfm_param->bd);
+      break;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+  }
+}
 
-    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
-    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
-    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
-    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
-    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
-    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
-    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
-    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
-    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
-    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
-    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
-    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
-    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
-    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
-    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
-    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
+                                          uint8_t *dest, int stride,
+                                          const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+      // The SSE4.1 version doesn't support these transform types, so use the C
+      // version for them.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                 tx_type, bd);
+      break;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+  }
+}
 
-    for (i = 56; i < 64; i++) v[i] = u[i];
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
+                                          uint8_t *dest, int stride,
+                                          const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                                txfm_param->tx_size,
+                                                txfm_param->eob, bd);
+      break;
+      // The SSE4.1 version doesn't support IDTX, so use the C version for it.
+    case IDTX:
+      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                 tx_type, bd);
+      break;
+    default: assert(0);
+  }
+}
 
-    // stage 11
-    if (do_cols) {
-      for (i = 0; i < 32; i++) {
-        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
-                               &out[16 * (63 - i) + col]);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  int eob = txfm_param->eob;
+  int bd = txfm_param->bd;
+  int lossless = txfm_param->lossless;
+  const int32_t *src = cast_to_int32(input);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    return;
+  }
+  switch (tx_type) {
+      // The SSE4.1 version doesn't support these transform types, so use the C
+      // version for them.
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                               bd);
+      break;
+    default:
+      av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                    tx_type, bd);
+      break;
+  }
+}
+
+static const transform_1d_sse4_1
+    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
+        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      {
+          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+            NULL },
+          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+            NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
+          idct32x32_sse4_1 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
+          idct64x64_sse4_1 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  __m128i buf1[64 * 16];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: row transform
+  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+    __m128i buf0[64];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+      __m128i *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_sse4_1(
+          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m128i *_buf1 = buf1 + i * 4;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
       }
     } else {
-      for (i = 0; i < 32; i++) {
-        addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
-                            &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi,
-                            out_shift);
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
       }
     }
   }
-}
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+                                    buf1 + i * txfm_size_row, txfm_size_row,
+                                    -shift[1]);
+  }
 
-void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output,
-                                     int stride, TX_TYPE tx_type, int bd) {
-  __m128i in[64 * 64 / 4], out[64 * 64 / 4];
-  const int8_t *shift = inv_txfm_shift_ls[TX_64X64];
-  const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
-  const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+                                     output + 8 * i, stride, ud_flip,
+                                     txfm_size_row, bd);
+    }
+  }
+}
 
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+                                               uint8_t *output, int stride,
+                                               TX_TYPE tx_type, TX_SIZE tx_size,
+                                               int eob, const int bd) {
   switch (tx_type) {
     case DCT_DCT:
-      load_buffer_64x64_lower_32x32(coeff, in);
-      transpose_64x64(in, out, 0);
-      idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                       -shift[0]);
-      transpose_64x64(in, out, 1);
-      idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
-      write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd);
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      highbd_inv_txfm2d_add_no_identity_sse41(
+          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+          bd);
       break;
+    default: assert(0); break;
+  }
+}
 
-    default:
-      av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  switch (tx_size) {
+    case TX_32X32:
+      av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X16:
+      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      break;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X16:
+      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X8:
+      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X32:
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X16:
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      break;
+    case TX_32X64:
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X32:
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      break;
+    case TX_64X64:
+    case TX_16X64:
+    case TX_64X16:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(
+          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+          txfm_param->eob, txfm_param->bd);
       break;
+    default: assert(0 && "Invalid transform size"); break;
   }
 }
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
index 608bd88a4..e298cf653 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/convolve_sse4_1.h"
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
index b29bd1d79..6f24e5948 100644
--- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
-#define _HIGHBD_TXFM_UTILITY_SSE4_H
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
 
 #include <smmintrin.h> /* SSE4.1 */
 
@@ -75,6 +75,17 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
                 out[63]);
 }
 
+static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {
+  for (int j = 0; j < 8; j++) {
+    for (int i = 0; i < 8; i++) {
+      TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],
+                    input[i * 32 + j + 16], input[i * 32 + j + 24],
+                    output[j * 32 + i + 0], output[j * 32 + i + 8],
+                    output[j * 32 + i + 16], output[j * 32 + i + 24]);
+    }
+  }
+}
+
 // Note:
 //  rounding = 1 << (bit - 1)
 static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
@@ -100,4 +111,15 @@ static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
   return x;
 }
 
-#endif  // _HIGHBD_TXFM_UTILITY_SSE4_H
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+                                    int do_cols, int bd, int out_shift);
+
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+                                        const int num_cols);
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+                                               uint8_t *output, int stride,
+                                               TX_TYPE tx_type, TX_SIZE tx_size,
+                                               int eob, const int bd);
+
+#endif  // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
index a08beaafd..4bcab0564 100644
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -19,10 +19,21 @@ static const uint8_t warp_highbd_arrange_bytes[16] = {
   0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
 };
 
-static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
-                                     int sx, int alpha, int k,
-                                     const int offset_bits_horiz,
-                                     const int reduce_bits_horiz) {
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
+  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                          __m128i *coeff) {
   // Filter even-index pixels
   const __m128i tmp_0 = _mm_loadu_si128(
       (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
@@ -43,27 +54,13 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
   const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
 
   // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
-  const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
   // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
-  const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
   // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
-  const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
   // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
-  const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
-                                             ((1 << reduce_bits_horiz) >> 1));
-
-  // Calculate filtered results
-  const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
-  const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
-  const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
-  const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
-
-  __m128i res_even =
-      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
-  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
-                           _mm_cvtsi32_si128(reduce_bits_horiz));
+  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
   // Filter odd-index pixels
   const __m128i tmp_1 = _mm_loadu_si128(
@@ -80,15 +77,63 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
   const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
   const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
 
-  const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-  const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-  const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-  const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+    int sx, __m128i *coeff) {
+  // Filter coeff
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  coeff[0] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+  coeff[2] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+  coeff[4] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+  coeff[6] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+  coeff[1] = coeff[0];
+  coeff[3] = coeff[2];
+  coeff[5] = coeff[4];
+  coeff[7] = coeff[6];
+}
+
+static INLINE void highbd_filter_src_pixels(
+    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+  const __m128i src_1 = *src;
+  const __m128i src2_1 = *src2;
 
-  const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
-  const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
-  const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
-  const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+                                             ((1 << reduce_bits_horiz) >> 1));
+
+  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
+  const __m128i res_2 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
+  const __m128i res_4 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
+  const __m128i res_6 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
+
+  __m128i res_even =
+      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+                           _mm_cvtsi32_si128(reduce_bits_horiz));
+
+  const __m128i res_1 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
+  const __m128i res_3 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
+  const __m128i res_5 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
+  const __m128i res_7 =
+      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
 
   __m128i res_odd =
       _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
@@ -101,6 +146,145 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
   tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
 }
 
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
+                                       __m128i *tmp, int sx, int alpha, int k,
+                                       const int offset_bits_horiz,
+                                       const int reduce_bits_horiz) {
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
+  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
+                           reduce_bits_horiz, k);
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+  (void)alpha;
+  int k;
+
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;
+  int k;
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+    __m128i coeff[8];
+    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void highbd_warp_horizontal_filter_beta0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+  int k;
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void highbd_warp_horizontal_filter(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  int k;
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m128i src2 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
+                        reduce_bits_horiz);
+  }
+}
+
+static INLINE void highbd_prepare_warp_horizontal_filter(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  if (alpha == 0 && beta == 0)
+    highbd_warp_horizontal_filter_alpha0_beta0(
+        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+        offset_bits_horiz, reduce_bits_horiz);
+
+  else if (alpha == 0 && beta != 0)
+    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                         beta, p_height, height, i,
+                                         offset_bits_horiz, reduce_bits_horiz);
+
+  else if (alpha != 0 && beta == 0)
+    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                        beta, p_height, height, i,
+                                        offset_bits_horiz, reduce_bits_horiz);
+  else
+    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                  p_height, height, i, offset_bits_horiz,
+                                  reduce_bits_horiz);
+}
+
 void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                    int width, int height, int stride,
                                    uint16_t *pred, int p_col, int p_row,
@@ -247,27 +431,13 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
           const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
           const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
 
-          horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k,
-                            offset_bits_horiz, reduce_bits_horiz);
+          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+                              offset_bits_horiz, reduce_bits_horiz);
         }
       } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          // Load source pixels
-          const __m128i src =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          const __m128i src2 =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
-
-          horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz,
-                            reduce_bits_horiz);
-        }
+        highbd_prepare_warp_horizontal_filter(
+            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+            offset_bits_horiz, reduce_bits_horiz);
       }
 
       // Vertical filter
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
index d1ea26290..9f2e2b457 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -13,7 +13,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/convolve_sse4_1.h"
@@ -21,6 +20,21 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
+static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi16(w0);
+  const __m256i wt1 = _mm256_set1_epi16(w1);
+  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  return wt;
+}
+
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+  return _mm256_permute2x128_si256(
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
+}
+
 void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
                              int dst_stride0, int w, int h,
                              const InterpFilterParams *filter_params_x,
@@ -34,11 +48,7 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint8_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
   const int offset_0 =
@@ -68,13 +78,11 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   (void)subpel_y_q4;
 
   for (i = 0; i < h; i += 2) {
+    const uint8_t *src_data = src_ptr + i * src_stride;
+    CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
     for (j = 0; j < w; j += 8) {
-      const __m256i data = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
-          _mm256_castsi128_si256(_mm_loadu_si128(
-              (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))),
-          0x20);
+      const __m256i data =
+          load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
 
       __m256i res = convolve_lowbd_x(data, coeffs, filt);
 
@@ -86,13 +94,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
       // Accumulate values into the destination buffer
       if (do_average) {
-        const __m256i data_ref_0 = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-            _mm256_castsi128_si256(_mm_loadu_si128(
-                (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-            0x20);
-
+        const __m256i data_ref_0 =
+            load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
         const __m256i comp_avg_res =
             comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
 
@@ -141,11 +144,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const __m256i round_const =
       _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
   const int offset_0 =
@@ -172,72 +171,35 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   for (j = 0; j < w; j += 16) {
     const uint8_t *data = &src_ptr[j];
     __m256i src6;
-
     // Load lines a and b. Line a to lower 128, line b to upper 128
-    const __m256i src_01a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-        0x20);
-
-    const __m256i src_12a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-        0x20);
-
-    const __m256i src_23a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-        0x20);
-
-    const __m256i src_34a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-        0x20);
-
-    const __m256i src_45a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-        0x20);
-
-    src6 = _mm256_castsi128_si256(
-        _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-    const __m256i src_56a = _mm256_permute2x128_si256(
-        _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-        src6, 0x20);
-
-    s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-    s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-    s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
-    s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
-    s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
-    s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+    {
+      __m256i src_ab[7];
+      __m256i src_a[7];
+      src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+      for (int kk = 0; kk < 6; ++kk) {
+        data += src_stride;
+        src_a[kk + 1] =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+        src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+      }
+      src6 = src_a[6];
+      s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+      s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+      s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+      s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+      s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+      s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+    }
 
     for (i = 0; i < h; i += 2) {
-      data = &src_ptr[i * src_stride + j];
-      const __m256i src_67a = _mm256_permute2x128_si256(
-          src6,
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-          0x20);
+      data = &src_ptr[(i + 7) * src_stride + j];
+      const __m256i src7 =
+          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+      const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
 
       src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-      const __m256i src_78a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-          src6, 0x20);
+          _mm_loadu_si128((__m128i *)(data + src_stride)));
+      const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
 
       s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
       s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
@@ -266,13 +228,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
 
       if (w - j < 16) {
         if (do_average) {
-          const __m256i data_ref_0 = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-              0x20);
-
+          const __m256i data_ref_0 = load_line2_avx2(
+              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
           const __m256i comp_avg_res =
               comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
 
@@ -325,19 +282,12 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
             _mm256_add_epi16(res_hi_round, offset_const_2);
 
         if (do_average) {
-          const __m256i data_ref_0_lo = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-              0x20);
-
-          const __m256i data_ref_0_hi = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))),
-              0x20);
+          const __m256i data_ref_0_lo = load_line2_avx2(
+              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+
+          const __m256i data_ref_0_hi =
+              load_line2_avx2(&dst[i * dst_stride + j + 8],
+                              &dst[i * dst_stride + j + 8 + dst_stride]);
 
           const __m256i comp_avg_res_lo =
               comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
@@ -404,11 +354,7 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
   const int offset_0 =
@@ -442,15 +388,14 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
   for (j = 0; j < w; j += 8) {
     /* Horizontal filter */
     {
+      const uint8_t *src_h = src_ptr + j;
       for (i = 0; i < im_h; i += 2) {
-        __m256i data = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+        __m256i data =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
         if (i + 1 < im_h)
           data = _mm256_inserti128_si256(
-              data,
-              _mm_loadu_si128(
-                  (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
-              1);
+              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+        src_h += (src_stride << 1);
         __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
 
         res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
@@ -500,13 +445,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
           const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
 
           if (do_average) {
-            const __m256i data_ref_0 = _mm256_permute2x128_si256(
-                _mm256_castsi128_si256(
-                    _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-                _mm256_castsi128_si256(_mm_loadu_si128(
-                    (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-                0x20);
-
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
             const __m256i comp_avg_res =
                 comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
 
@@ -534,12 +475,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
           const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
 
           if (do_average) {
-            const __m256i data_ref_0 = _mm256_permute2x128_si256(
-                _mm256_castsi128_si256(
-                    _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-                _mm256_castsi128_si256(_mm_loadu_si128(
-                    (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-                0x20);
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
 
             const __m256i comp_avg_res =
                 comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -598,11 +536,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
   const __m128i left_shift = _mm_cvtsi32_si128(bits);
   const int do_average = conv_params->do_average;
   const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16(w0);
-  const __m256i wt1 = _mm256_set1_epi16(w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i wt = unpack_weights_avx2(conv_params);
   const __m256i zero = _mm256_setzero_si256();
 
   const int offset_0 =
@@ -663,13 +597,8 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
 
         // Accumulate values into the destination buffer
         if (do_average) {
-          const __m256i data_ref_0 = _mm256_permute2x128_si256(
-              _mm256_castsi128_si256(
-                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
-              _mm256_castsi128_si256(_mm_loadu_si128(
-                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
-              0x20);
-
+          const __m256i data_ref_0 = load_line2_avx2(
+              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
           const __m256i comp_avg_res =
               comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
 
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
index ffbb31849..f645e0454 100644
--- a/third_party/aom/av1/common/x86/reconinter_avx2.c
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -16,8 +16,504 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "av1/common/blockd.h"
 
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+                                     const __m256i s1) {
+  const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
+  return _mm256_abs_epi16(
+      _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
+  // No clamp to [0, 64] needed: result is in [38, 53], or [11, 26] if inverted.
+}
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
+                                          DIFFWTD_MASK_TYPE mask_type,
+                                          const uint8_t *src0, int stride0,
+                                          const uint8_t *src1, int stride1,
+                                          int h, int w) {
+  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+  const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
+  int i = 0;
+  if (4 == w) {
+    do {
+      const __m128i s0A = xx_loadl_32(src0);
+      const __m128i s0B = xx_loadl_32(src0 + stride0);
+      const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
+      const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+      const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+      const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+      const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
+
+      const __m128i s1A = xx_loadl_32(src1);
+      const __m128i s1B = xx_loadl_32(src1 + stride1);
+      const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
+      const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+      const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+      const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+      const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+      const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+      const __m128i x_m8 =
+          _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
+      xx_storeu_128(mask, x_m8);
+      src0 += (stride0 << 2);
+      src1 += (stride1 << 2);
+      mask += 16;
+      i += 4;
+    } while (i < h);
+  } else if (8 == w) {
+    do {
+      const __m128i s0A = xx_loadl_64(src0);
+      const __m128i s0B = xx_loadl_64(src0 + stride0);
+      const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
+      const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+      const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
+      const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
+      const __m128i s1A = xx_loadl_64(src1);
+      const __m128i s1B = xx_loadl_64(src1 + stride1);
+      const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
+      const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+      const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
+      const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
+      const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
+      const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
+      const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
+      yy_storeu_256(mask, m8);
+      src0 += stride0 << 2;
+      src1 += stride1 << 2;
+      mask += 32;
+      i += 4;
+    } while (i < h);
+  } else if (16 == w) {
+    do {
+      const __m128i s0A = xx_load_128(src0);
+      const __m128i s0B = xx_load_128(src0 + stride0);
+      const __m128i s1A = xx_load_128(src1);
+      const __m128i s1B = xx_load_128(src1 + stride1);
+      const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+      const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+      const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+      const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+
+      const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+      const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+
+      const __m256i m8 =
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+      yy_storeu_256(mask, m8);
+      src0 += stride0 << 1;
+      src1 += stride1 << 1;
+      mask += 32;
+      i += 2;
+    } while (i < h);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const __m256i s0 = yy_loadu_256(src0 + j);
+        const __m256i s1 = yy_loadu_256(src1 + j);
+        const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+        const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+        const __m256i s0H =
+            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+        const __m256i s1H =
+            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+        const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+        const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+        const __m256i m8 =
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+        yy_storeu_256(mask + j, m8);
+        j += 32;
+      } while (j < w);
+      src0 += stride0;
+      src1 += stride1;
+      mask += w;
+      i += 1;
+    } while (i < h);
+  }
+}
+
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+                                         const __m256i *data_src1,
+                                         const __m256i *round_const,
+                                         const __m256i *mask_base_16,
+                                         const __m256i *clip_diff, int round) {
+  const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+  const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+  const __m256i diff = _mm256_max_epu16(diffa, diffb);
+  const __m256i diff_round =
+      _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+  const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+  const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+  const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+  return diff_clamp;
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+                                             const __m256i *data_src1,
+                                             const __m256i *round_const,
+                                             const __m256i *mask_base_16,
+                                             const __m256i *clip_diff,
+                                             int round) {
+  const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+  const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+  const __m256i diff = _mm256_max_epu16(diffa, diffb);
+  const __m256i diff_round =
+      _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+  const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+  const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+  const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+  const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
+  return diff_const_16;
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_avx2(
+    uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+  const int mask_base = 38;
+  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+  const __m256i y38 = _mm256_set1_epi16(mask_base);
+  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  int i = 0;
+  if (w == 4) {
+    do {
+      const __m128i s0A = xx_loadl_64(src0);
+      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+      const __m128i s1A = xx_loadl_64(src1);
+      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+                                      _mm_unpacklo_epi64(s0A, s0B));
+      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+                                      _mm_unpacklo_epi64(s1A, s1B));
+      const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+      xx_storeu_128(mask,
+                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 16;
+      i += 4;
+    } while (i < h);
+  } else if (w == 8) {
+    do {
+      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+      const __m256i s0CD =
+          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+      const __m256i s1CD =
+          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+      const __m256i m16AB =
+          calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+      const __m256i m16CD =
+          calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 32;
+      i += 4;
+    } while (i < h);
+  } else if (w == 16) {
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride << 1;
+      src1 += src1_stride << 1;
+      mask += 32;
+      i += 2;
+    } while (i < h);
+  } else if (w == 32) {
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 32;
+      i += 1;
+    } while (i < h);
+  } else if (w == 64) {
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 64;
+      i += 1;
+    } while (i < h);
+  } else {
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s0E = yy_loadu_256(src0 + 64);
+      const __m256i s0F = yy_loadu_256(src0 + 80);
+      const __m256i s0G = yy_loadu_256(src0 + 96);
+      const __m256i s0H = yy_loadu_256(src0 + 112);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i s1E = yy_loadu_256(src1 + 64);
+      const __m256i s1F = yy_loadu_256(src1 + 80);
+      const __m256i s1G = yy_loadu_256(src1 + 96);
+      const __m256i s1H = yy_loadu_256(src1 + 112);
+      const __m256i m16A =
+          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m16E =
+          calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+      const __m256i m16F =
+          calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+      const __m256i m16G =
+          calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+      const __m256i m16H =
+          calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 128;
+      i += 1;
+    } while (i < h);
+  }
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+    uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+  const int mask_base = 38;
+  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);  // rounding term
+  const __m256i y38 = _mm256_set1_epi16(mask_base);
+  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  int i = 0;  // row counter; each do/while below runs at least once
+  if (w == 4) {  // 4 rows per iteration, 16 mask bytes stored
+    do {
+      const __m128i s0A = xx_loadl_64(src0);
+      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+      const __m128i s1A = xx_loadl_64(src1);
+      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+                                      _mm_unpacklo_epi64(s0A, s0B));
+      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+                                      _mm_unpacklo_epi64(s1A, s1B));
+      const __m256i m16 =
+          calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+      xx_storeu_128(mask,  // permute 0xd8 moves qword 2 next to qword 0
+                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 16;
+      i += 4;
+    } while (i < h);
+  } else if (w == 8) {  // 4 rows per iteration, 32 mask bytes stored
+    do {
+      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+      const __m256i s0CD =
+          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+      const __m256i s1CD =
+          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+      const __m256i m16AB =
+          calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+      const __m256i m16CD =
+          calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride << 2;
+      src1 += src1_stride << 2;
+      mask += 32;
+      i += 4;
+    } while (i < h);
+  } else if (w == 16) {  // 2 rows per iteration, 32 mask bytes stored
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride << 1;
+      src1 += src1_stride << 1;
+      mask += 32;
+      i += 2;
+    } while (i < h);
+  } else if (w == 32) {  // 1 row per iteration, 32 mask bytes stored
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 32;
+      i += 1;
+    } while (i < h);
+  } else if (w == 64) {  // 1 row per iteration, 64 mask bytes stored
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 64;
+      i += 1;
+    } while (i < h);
+  } else {  // NOTE(review): presumably w == 128 (reads 128 values per row)
+    do {
+      const __m256i s0A = yy_loadu_256(src0);
+      const __m256i s0B = yy_loadu_256(src0 + 16);
+      const __m256i s0C = yy_loadu_256(src0 + 32);
+      const __m256i s0D = yy_loadu_256(src0 + 48);
+      const __m256i s0E = yy_loadu_256(src0 + 64);
+      const __m256i s0F = yy_loadu_256(src0 + 80);
+      const __m256i s0G = yy_loadu_256(src0 + 96);
+      const __m256i s0H = yy_loadu_256(src0 + 112);
+      const __m256i s1A = yy_loadu_256(src1);
+      const __m256i s1B = yy_loadu_256(src1 + 16);
+      const __m256i s1C = yy_loadu_256(src1 + 32);
+      const __m256i s1D = yy_loadu_256(src1 + 48);
+      const __m256i s1E = yy_loadu_256(src1 + 64);
+      const __m256i s1F = yy_loadu_256(src1 + 80);
+      const __m256i s1G = yy_loadu_256(src1 + 96);
+      const __m256i s1H = yy_loadu_256(src1 + 112);
+      const __m256i m16A =
+          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+      const __m256i m16B =
+          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+      const __m256i m16C =
+          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+      const __m256i m16D =
+          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+      const __m256i m16E =
+          calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+      const __m256i m16F =
+          calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+      const __m256i m16G =
+          calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+      const __m256i m16H =
+          calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      mask += 128;
+      i += 1;
+    } while (i < h);
+  }
+}
+
+void av1_build_compound_diffwtd_mask_d16_avx2(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  const int shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  // Adding the rounding constant may overflow, but that much precision is not
+  // required. The code should also work for other values of DIFF_FACTOR_LOG2
+  // and AOM_BLEND_A64_MAX_ALPHA, but corner-case bugs are possible, so the
+  // current values are asserted below.
+  assert(DIFF_FACTOR_LOG2 == 4);
+  assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+  if (mask_type == DIFFWTD_38) {
+    build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+                                         src1_stride, h, w, shift);
+  } else {  // every other mask_type takes the _inv variant
+    build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+                                             src1_stride, h, w, shift);
+  }
+}
+
 void av1_build_compound_diffwtd_mask_highbd_avx2(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
     int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
index 375def62e..0aaf1f454 100644
--- a/third_party/aom/av1/common/x86/selfguided_avx2.c
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -546,17 +546,18 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
   }
 }
 
-void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
-                                     int dgd_stride, int32_t *flt0,
-                                     int32_t *flt1, int flt_stride,
-                                     int sgr_params_idx, int bit_depth,
-                                     int highbd) {
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+                                    int dgd_stride, int32_t *flt0,
+                                    int32_t *flt1, int flt_stride,
+                                    int sgr_params_idx, int bit_depth,
+                                    int highbd) {
   // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
   // Ctl and Dtl is 32-byte aligned.
   const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
 
-  DECLARE_ALIGNED(32, int32_t,
-                  buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
+  int32_t *buf = aom_memalign(
+      32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
+  if (!buf) return -1;
 
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -625,6 +626,8 @@ void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
     final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                  height, highbd);
   }
+  aom_free(buf);
+  return 0;
 }
 
 void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
@@ -635,8 +638,10 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
-  av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
-                                  width, eps, bit_depth, highbd);
+  const int ret = av1_selfguided_restoration_avx2(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  (void)ret;
+  assert(!ret);
   const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
   decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index c64150b9d..ea3f6d942 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -499,13 +499,15 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
   }
 }
 
-void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
-                                       int height, int dgd_stride,
-                                       int32_t *flt0, int32_t *flt1,
-                                       int flt_stride, int sgr_params_idx,
-                                       int bit_depth, int highbd) {
-  DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]);
-  memset(buf, 0, sizeof(buf));
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+                                      int height, int dgd_stride, int32_t *flt0,
+                                      int32_t *flt1, int flt_stride,
+                                      int sgr_params_idx, int bit_depth,
+                                      int highbd) {
+  int32_t *buf = (int32_t *)aom_memalign(
+      16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+  if (!buf) return -1;
+  memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
 
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -574,6 +576,8 @@ void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
     final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                  height, highbd);
   }
+  aom_free(buf);
+  return 0;
 }
 
 void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
@@ -584,8 +588,10 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
-  av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1,
-                                    width, eps, bit_depth, highbd);
+  const int ret = av1_selfguided_restoration_sse4_1(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  (void)ret;
+  assert(!ret);
   const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
   decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
index efc542cbf..b810cea2e 100644
--- a/third_party/aom/av1/common/x86/warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -203,15 +203,72 @@ static const uint8_t even_mask[16] = { 0, 2,  2,  4,  4,  6,  6,  8,
 static const uint8_t odd_mask[16] = { 1, 3,  3,  5,  5,  7,  7,  9,
                                       9, 11, 11, 13, 13, 15, 15, 0 };
 
-static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
-                                     int alpha, int k,
+static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
+                                                   0, 1, 0, 1, 0, 1, 0, 1 };
+
+static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
+                                                   2, 3, 2, 3, 2, 3, 2, 3 };
+
+static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
+                                                   4, 5, 4, 5, 4, 5, 4, 5 };
+
+static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
+                                                   6, 7, 6, 7, 6, 7, 6, 7 };
+
+static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
+                                                  0, 1, 2, 3, 0, 1, 2, 3 };
+static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
+                                                  4, 5, 6, 7, 4, 5, 6, 7 };
+static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
+                                                  8, 9, 10, 11, 8, 9, 10, 11 };
+static const uint8_t shuffle_gamma0_mask3[16] = {
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                      const int offset_bits_horiz,
-                                     const int reduce_bits_horiz) {
+                                     const int reduce_bits_horiz, int k) {
   const __m128i src_even =
       _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
   const __m128i src_odd =
       _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+  // The pixel order we need for 'src' is:
+  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
+  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+                                            _mm_srli_si128(src_odd, 4));
+  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
+  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+  const __m128i src_13 =
+      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
+  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+                                            _mm_srli_si128(src_even, 6));
+  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
+
+  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+                                             ((1 << reduce_bits_horiz) >> 1));
 
+  // Note: The values res_02 + res_46 and res_13 + res_57 both
+  // fit into int16s at this point, but their sum may be too wide to fit
+  // into an int16. However, once we also add round_const, the sum of
+  // all of these fits into a uint16.
+  //
+  // The wrapping behaviour of _mm_add_* is used here to make sure we
+  // get the correct result despite converting between different
+  // (implicit) types.
+  const __m128i res_even = _mm_add_epi16(res_02, res_46);
+  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+  const __m128i res =
+      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                   __m128i *coeff) {
   // Filter even-index pixels
   const __m128i tmp_0 = _mm_loadl_epi64(
       (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
@@ -249,47 +306,504 @@ static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
   const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
 
   // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
   // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
   // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
   // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-  const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
 
-  // The pixel order we need for 'src' is:
-  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
-  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
-  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
-  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
-  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
-                                            _mm_srli_si128(src_odd, 4));
-  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
-  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
-  const __m128i src_13 =
-      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
-  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
-  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
-  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
-                                            _mm_srli_si128(src_even, 6));
-  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+                                                          __m128i *coeff) {
+  // Filter even-index pixels
+  const __m128i tmp_0 =
+      _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
 
-  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
-                                             ((1 << reduce_bits_horiz) >> 1));
+  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+  coeff[0] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+  coeff[1] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+  coeff[2] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+  coeff[3] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+}
 
-  // Note: The values res_02 + res_46 and res_13 + res_57 both
-  // fit into int16s at this point, but their sum may be too wide to fit
-  // into an int16. However, once we also add round_const, the sum of
-  // all of these fits into a uint16.
-  //
-  // The wrapping behaviour of _mm_add_* is used here to make sure we
-  // get the correct result despite converting between different
-  // (implicit) types.
-  const __m128i res_even = _mm_add_epi16(res_02, res_46);
-  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
-  const __m128i res =
-      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
-  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+                                     int alpha, int k,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz) {
+  __m128i coeff[4];  // coefficient pairs 0/2, 4/6, 1/3, 5/7 in that order
+  prepare_horizontal_filter_coeff(alpha, sx, coeff);
+  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+                                          int stride, int32_t ix4, int32_t iy4,
+                                          int32_t sx4, int alpha, int beta,
+                                          int p_height, int height, int i,
+                                          const int offset_bits_horiz,
+                                          const int reduce_bits_horiz) {
+  int k;  // row offset from iy4; each row's result lands in tmp[k + 7]
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;  // clamped below to [0, height - 1]
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);  // advances by beta per row
+
+    // Load 16 source bytes starting at column ix4 - 7 of row iy
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+                      reduce_bits_horiz);
+  }
+}
+
+static INLINE void warp_horizontal_filter_alpha0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;  // unused: coefficients are derived from sx alone
+  int k;  // row offset from iy4
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;  // clamped below to [0, height - 1]
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+    int sx = sx4 + beta * (k + 4);  // advances by beta per row
+
+    // Load 16 source bytes starting at column ix4 - 7 of row iy
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+    __m128i coeff[4];  // rebuilt per row because sx changes with k
+    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void warp_horizontal_filter_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;  // unused: sx4 is used unchanged for every row
+  int k;  // row offset from iy4
+  __m128i coeff[4];
+  prepare_horizontal_filter_coeff(alpha, sx4, coeff);  // once, outside loop
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;  // clamped below to [0, height - 1]
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load 16 source bytes starting at column ix4 - 7 of row iy
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;  // unused: sx4 is used unchanged for every row
+  (void)alpha;  // unused: coefficients are derived from sx4 alone
+  int k;  // row offset from iy4
+
+  __m128i coeff[4];
+  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);  // once, outside loop
+
+  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+    int iy = iy4 + k;  // clamped below to [0, height - 1]
+    if (iy < 0)
+      iy = 0;
+    else if (iy > height - 1)
+      iy = height - 1;
+
+    // Load 16 source bytes starting at column ix4 - 7 of row iy
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+static INLINE void unpack_weights_and_set_round_const(
+    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+  *res_sub_const =  // negative constant, broadcast to all 16-bit lanes
+      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+                     (1 << (offset_bits - conv_params->round_1 - 1)));
+  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  *wt = _mm_unpacklo_epi16(wt0, wt1);  // 16-bit lanes: w0 w1 w0 w1 ...
+}
+
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+                                                  __m128i *coeffs) {
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_2 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_4 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_6 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+  // even coeffs (filters looked up at sy + {0, 2, 4, 6} * gamma)
+  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+  const __m128i tmp_1 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_3 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_5 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_7 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+  // odd coeffs (filters looked up at sy + {1, 3, 5, 7} * gamma)
+  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+                                                         __m128i *coeffs) {
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  // even coeffs: each mask repeats one 4-byte group of tmp_0 in all lanes
+  coeffs[0] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+  coeffs[1] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+  coeffs[2] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+  coeffs[3] =
+      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+
+  // odd coeffs: copies of the even ones, since only one filter is loaded
+  coeffs[4] = coeffs[0];
+  coeffs[5] = coeffs[1];
+  coeffs[6] = coeffs[2];
+  coeffs[7] = coeffs[3];
+}
+
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+                                              __m128i *res_lo, __m128i *res_hi,
+                                              int k) {
+  // Load from tmp and rearrange pairs of consecutive rows into the
+  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+  const __m128i *src = tmp + (k + 4);  // rows src[0] .. src[7] are read
+  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+
+  const __m128i res_even =  // 32-bit sums for the low-half columns
+      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
+
+  // Filter odd-index pixels (high halves of the same row pairs)
+  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
+  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
+  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
+  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
+
+  const __m128i res_odd =
+      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
+
+  // Rearrange pixels back into the order 0 ... 7
+  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+}
+
+static INLINE void store_vertical_filter_output(
+    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+    const int reduce_bits_vert, int p_stride, int p_width,
+    const int round_bits) {
+  __m128i res_lo_1 = *res_lo;
+  __m128i res_hi_1 = *res_hi;
+
+  if (conv_params->is_compound) {  // columns j..j+3 first, then j+4..j+7
+    __m128i *const p =
+        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
+                              reduce_bits_vert);
+    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
+    __m128i res_lo_16;
+    if (conv_params->do_average) {  // combine with conv_params->dst
+      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+      const __m128i p_16 = _mm_loadl_epi64(p);
+
+      if (conv_params->use_jnt_comp_avg) {
+        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
+        const __m128i shifted_32 =
+            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+      } else {  // unweighted: (p_16 + temp_lo_16) >> 1
+        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+      }
+
+      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
+
+      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
+                                 round_bits);
+      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+      *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);  // 4 bytes to pred
+    } else {  // no averaging: keep the 16-bit values in conv_params->dst
+      _mm_storel_epi64(p, temp_lo_16);
+    }
+    if (p_width > 4) {  // same steps for columns j+4..j+7
+      __m128i *const p4 =
+          (__m128i *)&conv_params
+              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
+                                reduce_bits_vert);
+      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
+      __m128i res_hi_16;
+
+      if (conv_params->do_average) {
+        __m128i *const dst8_4 =
+            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+        const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+        if (conv_params->use_jnt_comp_avg) {
+          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
+          const __m128i shifted_32 =
+              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+        } else {  // unweighted: (p4_16 + temp_hi_16) >> 1
+          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+        }
+        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
+
+        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
+                                   round_bits);
+        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+        *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);  // 4 bytes to pred
+
+      } else {
+        _mm_storel_epi64(p4, temp_hi_16);
+      }
+    }
+  } else {  // non-compound: round, narrow to 8 bits, write straight to pred
+    const __m128i res_lo_round = _mm_srai_epi32(
+        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+    const __m128i res_hi_round = _mm_srai_epi32(
+        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+    // Store the 8-bit result into 'pred' (no blending on this path)
+    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+    // Note: If we're outputting a 4x4 block, we need to be very careful
+    // to only output 4 pixels at this point, to avoid encode/decode
+    // mismatches when encoding with multiple threads.
+    if (p_width == 4) {
+      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+    } else {
+      _mm_storel_epi64(p, res_8bit);
+    }
+  }
+}
+
+static INLINE void warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  int k;  // row offset; the row written is i + k + 4
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+  // Vertical filter, general case: sy changes per row, so coeffs are rebuilt
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    int sy = sy4 + delta * (k + 4);
+
+    __m128i coeffs[8];
+    prepare_vertical_filter_coeffs(gamma, sy, coeffs);
+
+    __m128i res_lo;  // stored at columns j..j+3
+    __m128i res_hi;  // stored at columns j+4..j+7
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void warp_vertical_filter_gamma0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  int k;  // row offset; the row written is i + k + 4
+  (void)gamma;  // unused: the dispatcher calls this only when gamma == 0
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+  // Vertical filter, gamma == 0: per-row coeffs are derived from sy alone
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    int sy = sy4 + delta * (k + 4);
+
+    __m128i coeffs[8];
+    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
+
+    __m128i res_lo;  // stored at columns j..j+3
+    __m128i res_hi;  // stored at columns j+4..j+7
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void warp_vertical_filter_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;  // unused: the dispatcher calls this only when delta == 0
+  int k;  // row offset; the row written is i + k + 4
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+
+  __m128i coeffs[8];  // built once from sy4 and shared by all rows
+  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
+  // Vertical filter, delta == 0: every row reuses the coeffs prepared above
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    __m128i res_lo;  // stored at columns j..j+3
+    __m128i res_hi;  // stored at columns j+4..j+7
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;  // unused: the dispatcher calls this only when delta == 0
+  (void)gamma;  // unused: the dispatcher calls this only when gamma == 0
+  int k;  // row offset; the row written is i + k + 4
+  __m128i res_sub_const, round_bits_const, wt;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &res_sub_const, &round_bits_const, &wt);
+
+  __m128i coeffs[8];  // built once from sy4 and shared by all rows
+  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
+  // Vertical filter, gamma == delta == 0: rows reuse the coeffs built above
+  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {  // at most 8 rows
+    __m128i res_lo;  // stored at columns j..j+3
+    __m128i res_hi;  // stored at columns j+4..j+7
+    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+                                 &res_sub_const, &round_bits_const, pred,
+                                 conv_params, i, j, k, reduce_bits_vert,
+                                 p_stride, p_width, round_bits);
+  }
+}
+
+static INLINE void prepare_warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {  // dispatch by zero args
+  if (gamma == 0 && delta == 0)  // coeffs built once, gamma0 helper
+    warp_vertical_filter_gamma0_delta0(
+        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
+        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+  else if (gamma == 0 && delta != 0)  // per-row coeffs, gamma0 helper
+    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
+                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                                res_add_const, round_bits, offset_bits);
+  else if (gamma != 0 && delta == 0)  // coeffs built once from sy4
+    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
+                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                                res_add_const, round_bits, offset_bits);
+  else  // gamma != 0 && delta != 0: general path, per-row coeffs
+    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                         res_add_const, round_bits, offset_bits);
+}
+
+static INLINE void prepare_warp_horizontal_filter(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  if (alpha == 0 && beta == 0)  // dispatch on which of alpha/beta is zero
+    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                        beta, p_height, height, i,
+                                        offset_bits_horiz, reduce_bits_horiz);
+  else if (alpha == 0 && beta != 0)  // only alpha is zero
+    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                  p_height, height, i, offset_bits_horiz,
+                                  reduce_bits_horiz);
+  else if (alpha != 0 && beta == 0)  // only beta is zero
+    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                 p_height, height, i, offset_bits_horiz,
+                                 reduce_bits_horiz);
+  else  // alpha != 0 && beta != 0: general path
+    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                           p_height, height, i, offset_bits_horiz,
+                           reduce_bits_horiz);
 }
 
 void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
@@ -309,24 +823,12 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 
   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
   const __m128i reduce_bits_vert_const =
       _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
   const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const __m128i res_sub_const =
-      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
-                     (1 << (offset_bits - conv_params->round_1 - 1)));
-  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
-  __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
-
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
   assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
 
   /* Note: For this code to work, the left/right frame borders need to be
@@ -340,6 +842,13 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
   assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
   }
   }*/
+  __m128i res_add_const_1;
+  if (conv_params->is_compound == 1) {
+    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+  } else {
+    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                                     ((1 << reduce_bits_vert) >> 1));
+  }
 
   for (i = 0; i < p_height; i += 8) {
     for (j = 0; j < p_width; j += 8) {
@@ -419,203 +928,15 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                             reduce_bits_horiz);
         }
       } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          // Load source pixels
-          const __m128i src =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
-                            reduce_bits_horiz);
-        }
+        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                       beta, p_height, height, i,
+                                       offset_bits_horiz, reduce_bits_horiz);
       }
 
       // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-
-        // Load from tmp and rearrange pairs of consecutive rows into the
-        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
-        const __m128i *src = tmp + (k + 4);
-        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
-        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
-        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
-        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
-        // Filter even-index pixels
-        const __m128i tmp_0 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_2 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_4 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_6 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
-        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
-        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
-        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
-        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
-        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
-        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
-        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
-        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
-        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
-        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
-        const __m128i tmp_1 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_3 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_5 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_7 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
-        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
-        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
-        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
-        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        if (conv_params->is_compound) {
-          __m128i *const p =
-              (__m128i *)&conv_params
-                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
-          res_lo = _mm_add_epi32(res_lo, res_add_const);
-          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
-                                 reduce_bits_vert_shift);
-          const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo);
-          __m128i res_lo_16;
-          if (conv_params->do_average) {
-            __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-            const __m128i p_16 = _mm_loadl_epi64(p);
-
-            if (conv_params->use_jnt_comp_avg) {
-              const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
-              const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
-              const __m128i shifted_32 =
-                  _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-              res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
-            } else {
-              res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
-            }
-
-            res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const);
-
-            res_lo_16 = _mm_sra_epi16(
-                _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift);
-            __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
-            *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
-          } else {
-            _mm_storel_epi64(p, temp_lo_16);
-          }
-          if (p_width > 4) {
-            __m128i *const p4 =
-                (__m128i *)&conv_params
-                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-
-            res_hi = _mm_add_epi32(res_hi, res_add_const);
-            res_hi =
-                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
-                              reduce_bits_vert_shift);
-            const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi);
-            __m128i res_hi_16;
-
-            if (conv_params->do_average) {
-              __m128i *const dst8_4 =
-                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
-              const __m128i p4_16 = _mm_loadl_epi64(p4);
-
-              if (conv_params->use_jnt_comp_avg) {
-                const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
-                const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt);
-                const __m128i shifted_32 =
-                    _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-                res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
-              } else {
-                res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
-              }
-              res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const);
-
-              res_hi_16 = _mm_sra_epi16(
-                  _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift);
-              __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
-              *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
-
-            } else {
-              _mm_storel_epi64(p4, temp_hi_16);
-            }
-          }
-        } else {
-          // Round and pack into 8 bits
-          const __m128i round_const =
-              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
-                             ((1 << reduce_bits_vert) >> 1));
-
-          const __m128i res_lo_round = _mm_srai_epi32(
-              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
-          const __m128i res_hi_round = _mm_srai_epi32(
-              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
-
-          const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-          __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
-          // Store, blending with 'pred' if needed
-          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
-          // Note: If we're outputting a 4x4 block, we need to be very careful
-          // to only output 4 pixels at this point, to avoid encode/decode
-          // mismatches when encoding with multiple threads.
-          if (p_width == 4) {
-            *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
-          } else {
-            _mm_storel_epi64(p, res_8bit);
-          }
-        }
-      }
+      prepare_warp_vertical_filter(
+          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
     }
   }
 }
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
index e1449fd21..87a6e1239 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -39,7 +39,8 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 
   DECLARE_ALIGNED(32, uint16_t,
                   temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
-  int intermediate_height = h + SUBPEL_TAPS - 1;
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
   const int center_tap = ((SUBPEL_TAPS - 1) / 2);
   const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
 
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
index 3083d224b..f9d00b733 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -32,7 +32,8 @@ void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
 
   DECLARE_ALIGNED(16, uint16_t,
                   temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
-  int intermediate_height = h + SUBPEL_TAPS - 1;
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
   int i, j;
   const int center_tap = ((SUBPEL_TAPS - 1) / 2);
   const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
-- 
cgit v1.2.3