From 68569dee1416593955c1570d638b3d9250b33012 Mon Sep 17 00:00:00 2001
From: trav90
Date: Mon, 15 Oct 2018 21:45:30 -0500
Subject: Import aom library

This is the reference implementation for the Alliance for Open Media's av1
video codec. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36.
---
 .../aom/av1/common/x86/av1_convolve_ssse3.c        | 1029 +++++++++++
 .../aom/av1/common/x86/av1_fwd_txfm1d_sse4.c       |  839 +++++++++
 .../aom/av1/common/x86/av1_fwd_txfm2d_sse4.c       |   81 +
 .../aom/av1/common/x86/av1_highbd_convolve_sse4.c  |  533 ++++++
 third_party/aom/av1/common/x86/av1_txfm1d_sse4.h   |  144 ++
 third_party/aom/av1/common/x86/filterintra_sse4.c  |  898 ++++++++++
 .../aom/av1/common/x86/highbd_inv_txfm_avx2.c      |  557 ++++++
 .../aom/av1/common/x86/highbd_inv_txfm_sse4.c      | 1398 +++++++++++++++
 .../aom/av1/common/x86/highbd_txfm_utility_sse4.h  |   92 +
 .../aom/av1/common/x86/highbd_warp_plane_ssse3.c   |  286 ++++
 .../aom/av1/common/x86/hybrid_inv_txfm_avx2.c      |  507 ++++++
 third_party/aom/av1/common/x86/idct_intrin_sse2.c  | 1402 +++++++++++++++
 third_party/aom/av1/common/x86/pvq_sse4.c          |  252 +++
 third_party/aom/av1/common/x86/pvq_sse4.h          |   13 +
 third_party/aom/av1/common/x86/selfguided_sse4.c   | 1805 ++++++++++++++++++++
 third_party/aom/av1/common/x86/warp_plane_sse2.c   |  297 ++++
 16 files changed, 10133 insertions(+)
 create mode 100644 third_party/aom/av1/common/x86/av1_convolve_ssse3.c
 create mode 100644 third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
 create mode 100644 third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
 create mode 100644 third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
 create mode 100644 third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
 create mode 100644 third_party/aom/av1/common/x86/filterintra_sse4.c
 create mode 100644 third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
 create mode 100644 third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
 create mode 100644 third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
 create mode 100644 third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
 create mode 100644 third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
 create mode 100644 third_party/aom/av1/common/x86/idct_intrin_sse2.c
 create mode 100644 third_party/aom/av1/common/x86/pvq_sse4.c
 create mode 100644 third_party/aom/av1/common/x86/pvq_sse4.h
 create mode 100644 third_party/aom/av1/common/x86/selfguided_sse4.c
 create mode 100644 third_party/aom/av1/common/x86/warp_plane_sse2.c

(limited to 'third_party/aom/av1/common/x86')

diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
new file mode 100644
index 000000000..91102bbaf
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
+
+#define WIDTH_BOUND (16)
+#define HEIGHT_BOUND (16)
+
+#if CONFIG_DUAL_FILTER
+DECLARE_ALIGNED(16, static int8_t,
+                sub_pel_filters_12sharp_signal_dir[15][2][16]);
+
+DECLARE_ALIGNED(16, static int8_t,
+                sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
+#endif  // CONFIG_DUAL_FILTER
+
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static int8_t,
+                sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]);
+
+DECLARE_ALIGNED(16, static int8_t,
+                sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]);
+#endif
+
+typedef int8_t (*SubpelFilterCoeffs)[16];
+
+static INLINE SubpelFilterCoeffs
+get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+  if (p.interp_filter == MULTITAP_SHARP) {
+    return &sub_pel_filters_12sharp_signal_dir[index][0];
+  }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+  if (p.interp_filter == TEMPORALFILTER_12TAP) {
+    return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
+  }
+#endif
+  (void)p;
+  (void)index;
+  return NULL;
+}
+
+static INLINE SubpelFilterCoeffs
+get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+  if (p.interp_filter == MULTITAP_SHARP) {
+    return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
+  }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+  if (p.interp_filter == TEMPORALFILTER_12TAP) {
+    return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
+  }
+#endif
+  (void)p;
+  (void)index;
+  return NULL;
+}
+
+static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
+  __m128i t0, t1;
+
+  t0 = _mm_unpacklo_epi16(in[0], in[1]);
+  t1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  out[0] = _mm_unpacklo_epi32(t0, t1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(t0, t1);
+  out[3] = _mm_srli_si128(out[2], 8);
+
+  t0 = _mm_unpackhi_epi16(in[0], in[1]);
+  t1 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  out[4] = _mm_unpacklo_epi32(t0, t1);
+  out[5] = _mm_srli_si128(out[4], 8);
+  // Note: We ignore out[6] and out[7] because
+  // they're zero vectors.
+} + +typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst); + +static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + __m128i y = _mm_loadl_epi64((__m128i const *)src); + y = _mm_unpacklo_epi8(y, zero); + y = _mm_add_epi16(*x, y); + y = _mm_add_epi16(y, one); + y = _mm_srai_epi16(y, 1); + y = _mm_packus_epi16(y, y); + return y; +} + +static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) { + uint32_t temp; + __m128i u = _mm_packus_epi16(*x, *x); + temp = _mm_cvtsi128_si32(u); + *(uint16_t *)dst = (uint16_t)temp; +} + +static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) { + uint32_t temp; + __m128i y = accumulate_store(x, dst); + temp = _mm_cvtsi128_si32(y); + *(uint16_t *)dst = (uint16_t)temp; +} + +static store_pixel_t store2pixelTab[2] = { store_2_pixel_only, + accumulate_store_2_pixel }; + +static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) { + __m128i u = _mm_packus_epi16(*x, *x); + *(int *)dst = _mm_cvtsi128_si32(u); +} + +static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) { + __m128i y = accumulate_store(x, dst); + *(int *)dst = _mm_cvtsi128_si32(y); +} + +static store_pixel_t store4pixelTab[2] = { store_4_pixel_only, + accumulate_store_4_pixel }; + +static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, + store_pixel_t store_func, uint8_t *dst) { + __m128i sumPairRow[4]; + __m128i sumPairCol[8]; + __m128i pixel; + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i zero = _mm_setzero_si128(); + + if (10 == tapsNum) { + src -= 1; + } + + pixel = _mm_loadu_si128((__m128i const *)src); + sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]); + sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]); + sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2); + + pixel = _mm_loadu_si128((__m128i const *)(src + 1)); + sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]); + sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]); + sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2); + + transpose_4x8(sumPairRow, sumPairCol); + + sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]); + sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]); + + sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]); + sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]); + + sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]); + sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]); + sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]); + + sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256); + sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]); + sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero); + + store_func(&sumPairRow[1], dst); +} + +static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, + store_pixel_t store, uint8_t *buf) { + horiz_w4_ssse3(src, f, tapsNum, store, buf); + src += 4; + buf += 4; + horiz_w4_ssse3(src, f, tapsNum, store, buf); +} + +static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, + store_pixel_t store, uint8_t *buf) { + horiz_w8_ssse3(src, f, tapsNum, store, buf); + src += 8; + buf += 8; + horiz_w8_ssse3(src, f, tapsNum, store, buf); +} + +static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, + store_pixel_t store, uint8_t *buf) { + horiz_w16_ssse3(src, f, tapsNum, store, buf); + src += 16; + buf += 16; + horiz_w16_ssse3(src, f, tapsNum, store, buf); +} 
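+
+// The wider kernels below follow the same halving pattern: horiz_wN_ssse3
+// filters the left half of the row, advances src and buf by N/2 pixels, and
+// then filters the right half with horiz_w(N/2)_ssse3.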
+ +static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, + store_pixel_t store, uint8_t *buf) { + horiz_w32_ssse3(src, f, tapsNum, store, buf); + src += 32; + buf += 32; + horiz_w32_ssse3(src, f, tapsNum, store, buf); +} + +static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, + store_pixel_t store, uint8_t *buf) { + horiz_w64_ssse3(src, f, tapsNum, store, buf); + src += 64; + buf += 64; + horiz_w64_ssse3(src, f, tapsNum, store, buf); +} + +static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t, + uint8_t *) = { + horiz_w4_ssse3, horiz_w8_ssse3, horiz_w16_ssse3, + horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3, +}; + +static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, + int width, store_pixel_t store, uint8_t *dst) { + switch (width) { + // Note: + // For width=2 and 4, store function must be different + case 2: + case 4: horizTab[0](src, f, tapsNum, store, dst); break; + case 8: horizTab[1](src, f, tapsNum, store, dst); break; + case 16: horizTab[2](src, f, tapsNum, store, dst); break; + case 32: horizTab[3](src, f, tapsNum, store, dst); break; + case 64: horizTab[4](src, f, tapsNum, store, dst); break; + case 128: horizTab[5](src, f, tapsNum, store, dst); break; + default: assert(0); + } +} + +// Vertical 8-pixel parallel +typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride, + uint8_t *dst, int dst_stride); + +static INLINE void transpose8x8_direct_to_dst(const uint16_t *src, + int src_stride, uint8_t *dst, + int dst_stride) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + __m128i v0, v1, v2, v3; + + __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + u0 = _mm_mulhrs_epi16(u0, k_256); + u1 = _mm_mulhrs_epi16(u1, k_256); + u2 = _mm_mulhrs_epi16(u2, k_256); + u3 = _mm_mulhrs_epi16(u3, k_256); + u4 = _mm_mulhrs_epi16(u4, k_256); + u5 = _mm_mulhrs_epi16(u5, k_256); + u6 = _mm_mulhrs_epi16(u6, k_256); + u7 = _mm_mulhrs_epi16(u7, k_256); + + v0 = _mm_packus_epi16(u0, u1); + v1 = _mm_packus_epi16(u2, u3); + v2 = _mm_packus_epi16(u4, u5); + v3 = _mm_packus_epi16(u6, u7); + + u0 = _mm_unpacklo_epi8(v0, v1); + u1 = _mm_unpackhi_epi8(v0, v1); + u2 = _mm_unpacklo_epi8(v2, v3); + u3 = _mm_unpackhi_epi8(v2, v3); + + u4 = _mm_unpacklo_epi8(u0, u1); + u5 = _mm_unpacklo_epi8(u2, u3); + u6 = _mm_unpackhi_epi8(u0, u1); + u7 = _mm_unpackhi_epi8(u2, u3); + + u0 = _mm_unpacklo_epi32(u4, u5); + u1 = _mm_unpackhi_epi32(u4, u5); + u2 = _mm_unpacklo_epi32(u6, u7); + u3 = _mm_unpackhi_epi32(u6, u7); + + u4 = _mm_srli_si128(u0, 8); + u5 = _mm_srli_si128(u1, 8); + u6 = _mm_srli_si128(u2, 8); + u7 = _mm_srli_si128(u3, 8); + + _mm_storel_epi64((__m128i *)dst, u0); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6); + 
_mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7); +} + +static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src, + int src_stride, uint8_t *dst, + int dst_stride) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + u0 = _mm_mulhrs_epi16(u0, k_256); + u1 = _mm_mulhrs_epi16(u1, k_256); + u2 = _mm_mulhrs_epi16(u2, k_256); + u3 = _mm_mulhrs_epi16(u3, k_256); + u4 = _mm_mulhrs_epi16(u4, k_256); + u5 = _mm_mulhrs_epi16(u5, k_256); + u6 = _mm_mulhrs_epi16(u6, k_256); + u7 = _mm_mulhrs_epi16(u7, k_256); + + v0 = _mm_packus_epi16(u0, u1); + v1 = _mm_packus_epi16(u2, u3); + v2 = _mm_packus_epi16(u4, u5); + v3 = _mm_packus_epi16(u6, u7); + + u0 = _mm_unpacklo_epi8(v0, v1); + u1 = _mm_unpackhi_epi8(v0, v1); + u2 = _mm_unpacklo_epi8(v2, v3); + u3 = _mm_unpackhi_epi8(v2, v3); + + u4 = _mm_unpacklo_epi8(u0, u1); + u5 = _mm_unpacklo_epi8(u2, u3); + u6 = _mm_unpackhi_epi8(u0, u1); + u7 = _mm_unpackhi_epi8(u2, u3); + + u0 = _mm_unpacklo_epi32(u4, u5); + u1 = _mm_unpackhi_epi32(u4, u5); + u2 = _mm_unpacklo_epi32(u6, u7); + u3 = _mm_unpackhi_epi32(u6, u7); + + u4 = _mm_srli_si128(u0, 8); + u5 = _mm_srli_si128(u1, 8); + u6 = _mm_srli_si128(u2, 8); + u7 = _mm_srli_si128(u3, 8); + + v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride)); + v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride)); + v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); + v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); + v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride)); + v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride)); + v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride)); + v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride)); + + u0 = _mm_unpacklo_epi8(u0, zero); + u1 = _mm_unpacklo_epi8(u1, zero); + u2 = _mm_unpacklo_epi8(u2, zero); + u3 = _mm_unpacklo_epi8(u3, zero); + u4 = _mm_unpacklo_epi8(u4, zero); + u5 = _mm_unpacklo_epi8(u5, zero); + u6 = _mm_unpacklo_epi8(u6, zero); + u7 = _mm_unpacklo_epi8(u7, zero); + + v0 = _mm_unpacklo_epi8(v0, zero); + v1 = _mm_unpacklo_epi8(v1, zero); + v2 = _mm_unpacklo_epi8(v2, zero); + v3 = _mm_unpacklo_epi8(v3, zero); + v4 = _mm_unpacklo_epi8(v4, zero); + v5 = _mm_unpacklo_epi8(v5, zero); + v6 = _mm_unpacklo_epi8(v6, zero); + v7 = _mm_unpacklo_epi8(v7, zero); + + v0 = _mm_adds_epi16(u0, v0); + v1 = _mm_adds_epi16(u4, v1); + v2 = _mm_adds_epi16(u1, v2); + v3 = _mm_adds_epi16(u5, v3); + v4 = _mm_adds_epi16(u2, v4); + v5 = _mm_adds_epi16(u6, v5); + v6 = _mm_adds_epi16(u3, v6); + v7 = _mm_adds_epi16(u7, v7); + + v0 = _mm_adds_epi16(v0, one); + v1 = _mm_adds_epi16(v1, one); + v2 = _mm_adds_epi16(v2, one); + v3 = _mm_adds_epi16(v3, one); + v4 = _mm_adds_epi16(v4, one); + v5 = _mm_adds_epi16(v5, one); + v6 = _mm_adds_epi16(v6, one); + v7 = 
_mm_adds_epi16(v7, one); + + v0 = _mm_srai_epi16(v0, 1); + v1 = _mm_srai_epi16(v1, 1); + v2 = _mm_srai_epi16(v2, 1); + v3 = _mm_srai_epi16(v3, 1); + v4 = _mm_srai_epi16(v4, 1); + v5 = _mm_srai_epi16(v5, 1); + v6 = _mm_srai_epi16(v6, 1); + v7 = _mm_srai_epi16(v7, 1); + + u0 = _mm_packus_epi16(v0, v1); + u1 = _mm_packus_epi16(v2, v3); + u2 = _mm_packus_epi16(v4, v5); + u3 = _mm_packus_epi16(v6, v7); + + u4 = _mm_srli_si128(u0, 8); + u5 = _mm_srli_si128(u1, 8); + u6 = _mm_srli_si128(u2, 8); + u7 = _mm_srli_si128(u3, 8); + + _mm_storel_epi64((__m128i *)dst, u0); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7); +} + +static transpose_to_dst_t trans8x8Tab[2] = { transpose8x8_direct_to_dst, + transpose8x8_accumu_to_dst }; + +static INLINE void transpose_8x16(const __m128i *in, __m128i *out) { + __m128i t0, t1, t2, t3, u0, u1; + + t0 = _mm_unpacklo_epi16(in[0], in[1]); + t1 = _mm_unpacklo_epi16(in[2], in[3]); + t2 = _mm_unpacklo_epi16(in[4], in[5]); + t3 = _mm_unpacklo_epi16(in[6], in[7]); + + u0 = _mm_unpacklo_epi32(t0, t1); + u1 = _mm_unpacklo_epi32(t2, t3); + + out[0] = _mm_unpacklo_epi64(u0, u1); + out[1] = _mm_unpackhi_epi64(u0, u1); + + u0 = _mm_unpackhi_epi32(t0, t1); + u1 = _mm_unpackhi_epi32(t2, t3); + + out[2] = _mm_unpacklo_epi64(u0, u1); + out[3] = _mm_unpackhi_epi64(u0, u1); + + t0 = _mm_unpackhi_epi16(in[0], in[1]); + t1 = _mm_unpackhi_epi16(in[2], in[3]); + t2 = _mm_unpackhi_epi16(in[4], in[5]); + t3 = _mm_unpackhi_epi16(in[6], in[7]); + + u0 = _mm_unpacklo_epi32(t0, t1); + u1 = _mm_unpacklo_epi32(t2, t3); + + out[4] = _mm_unpacklo_epi64(u0, u1); + out[5] = _mm_unpackhi_epi64(u0, u1); + + // Ignore out[6] and out[7] + // they're zero vectors. +} + +static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + __m128i *f, int tapsNum, uint16_t *buf) { + __m128i s[8], t[6]; + __m128i min_x2x3, max_x2x3; + __m128i temp; + + if (tapsNum == 10) { + src_ptr -= 1; + } + s[0] = _mm_loadu_si128((const __m128i *)src_ptr); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // TRANSPOSE... 
+  // Vector represents column pixel pairs instead of a row
+  transpose_8x16(s, t);
+
+  // multiply 2 adjacent elements with the filter and add the result
+  s[0] = _mm_maddubs_epi16(t[0], f[0]);
+  s[1] = _mm_maddubs_epi16(t[1], f[1]);
+  s[2] = _mm_maddubs_epi16(t[2], f[2]);
+  s[3] = _mm_maddubs_epi16(t[3], f[3]);
+  s[4] = _mm_maddubs_epi16(t[4], f[4]);
+  s[5] = _mm_maddubs_epi16(t[5], f[5]);
+
+  // add and saturate the results together
+  min_x2x3 = _mm_min_epi16(s[2], s[3]);
+  max_x2x3 = _mm_max_epi16(s[2], s[3]);
+  temp = _mm_adds_epi16(s[0], s[1]);
+  temp = _mm_adds_epi16(temp, s[5]);
+  temp = _mm_adds_epi16(temp, s[4]);
+
+  temp = _mm_adds_epi16(temp, min_x2x3);
+  temp = _mm_adds_epi16(temp, max_x2x3);
+
+  _mm_storeu_si128((__m128i *)buf, temp);
+}
+
+// Vertical 4-pixel parallel
+static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
+                                              int src_stride, uint8_t *dst,
+                                              int dst_stride) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  __m128i v0, v1, v2, v3;
+
+  // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
+  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpacklo_epi16(u2, u3);
+
+  v2 = _mm_unpacklo_epi32(v0, v1);
+  v3 = _mm_unpackhi_epi32(v0, v1);
+
+  u0 = _mm_mulhrs_epi16(v2, k_256);
+  u1 = _mm_mulhrs_epi16(v3, k_256);
+
+  u0 = _mm_packus_epi16(u0, u1);
+  u1 = _mm_srli_si128(u0, 4);
+  u2 = _mm_srli_si128(u0, 8);
+  u3 = _mm_srli_si128(u0, 12);
+
+  *(int *)(dst) = _mm_cvtsi128_si32(u0);
+  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
+                                              int src_stride, uint8_t *dst,
+                                              int dst_stride) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+
+  __m128i v0, v1, v2, v3;
+
+  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
+  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
+  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpacklo_epi16(u2, u3);
+
+  v2 = _mm_unpacklo_epi32(v0, v1);
+  v3 = _mm_unpackhi_epi32(v0, v1);
+
+  u0 = _mm_mulhrs_epi16(v2, k_256);
+  u1 = _mm_mulhrs_epi16(v3, k_256);
+
+  u2 = _mm_packus_epi16(u0, u1);
+  u0 = _mm_unpacklo_epi8(u2, zero);
+  u1 = _mm_unpackhi_epi8(u2, zero);
+
+  // load pixel values
+  v0 = _mm_loadl_epi64((__m128i const *)(dst));
+  v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+  v0 = _mm_unpacklo_epi8(v0, zero);
+  v1 = _mm_unpacklo_epi8(v1, zero);
+  v2 = _mm_unpacklo_epi8(v2, zero);
+  v3 = _mm_unpacklo_epi8(v3, zero);
+
+  v0 = _mm_unpacklo_epi64(v0, v1);
+  v1 = _mm_unpacklo_epi64(v2, v3);
+
+  u0 = _mm_adds_epi16(u0, v0);
+  u1 = _mm_adds_epi16(u1, v1);
+
+  u0 = _mm_adds_epi16(u0, one);
+  u1 = _mm_adds_epi16(u1, one);
+
+  u0 = _mm_srai_epi16(u0, 1);
+  u1 = _mm_srai_epi16(u1, 1);
+
+  // saturate and pack to pixels
+  u0 = _mm_packus_epi16(u0, u1);
+  u1 = _mm_srli_si128(u0, 4);
+  u2 = _mm_srli_si128(u0, 8);
+  u3 = _mm_srli_si128(u0, 12);
+
+  *(int *)(dst) = _mm_cvtsi128_si32(u0);
+  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst,
+                                             transpose4x4_accumu_to_dst };
+
+static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                   __m128i *f, int tapsNum, uint16_t *buf) {
+  __m128i A, B, C, D;
+  __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
+  __m128i x0, x1, x2, x3, x4, x5;
+  __m128i min_x2x3, max_x2x3, temp;
+
+  if (tapsNum == 10) {
+    src_ptr -= 1;
+  }
+  A = _mm_loadu_si128((const __m128i *)src_ptr);
+  B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+  C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+
+  // TRANSPOSE...
+  // Vector represents column pixel pairs instead of a row
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 02 03 12 13 22 23 32 33
+  s3s2 = _mm_srli_si128(s1s0, 8);
+  // 06 07 16 17 26 27 36 37
+  s7s6 = _mm_srli_si128(s5s4, 8);
+
+  tr0_0 = _mm_unpackhi_epi16(A, B);
+  tr0_1 = _mm_unpackhi_epi16(C, D);
+  s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  sbsa = _mm_srli_si128(s9s8, 8);
+
+  // multiply 2 adjacent elements with the filter and add the result
+  x0 = _mm_maddubs_epi16(s1s0, f[0]);
+  x1 = _mm_maddubs_epi16(s3s2, f[1]);
+  x2 = _mm_maddubs_epi16(s5s4, f[2]);
+  x3 = _mm_maddubs_epi16(s7s6, f[3]);
+  x4 = _mm_maddubs_epi16(s9s8, f[4]);
+  x5 = _mm_maddubs_epi16(sbsa, f[5]);
+  // add and saturate the results together
+  min_x2x3 = _mm_min_epi16(x2, x3);
+  max_x2x3 = _mm_max_epi16(x2, x3);
+  temp = _mm_adds_epi16(x0, x1);
+  temp = _mm_adds_epi16(temp, x5);
+  temp = _mm_adds_epi16(temp, x4);
+
+  temp = _mm_adds_epi16(temp, min_x2x3);
+  temp = _mm_adds_epi16(temp, max_x2x3);
+  _mm_storel_epi64((__m128i *)buf, temp);
+}
+
+// Note:
+// This function assumes:
+// (1) 10/12-tap filters
+// (2) x_step_q4 = 16, so the filter is fixed at the call
+
+void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride, int w, int h,
+                              const InterpFilterParams filter_params,
+                              const int subpel_x_q4, int x_step_q4,
+                              ConvolveParams *conv_params) {
+  DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);
+  __m128i verf[6];
+  __m128i horf[2];
+  SubpelFilterCoeffs hCoeffs, vCoeffs;
+  const uint8_t *src_ptr;
+  store_pixel_t store2p = store2pixelTab[conv_params->ref];
+  store_pixel_t store4p = store4pixelTab[conv_params->ref];
+  transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref];
+  transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref];
+
+  const int tapsNum = filter_params.taps;
+  int block_height, block_residu;
+  int i, col, count;
+  (void)x_step_q4;
+
+  if (0 == subpel_x_q4 || 16 != x_step_q4) {
+    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+                         subpel_x_q4, x_step_q4, conv_params);
+    return;
+  }
+
+  hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
+  vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 -
1); + + if (!hCoeffs || !vCoeffs) { + av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params, + subpel_x_q4, x_step_q4, conv_params); + return; + } + + verf[0] = *((const __m128i *)(vCoeffs)); + verf[1] = *((const __m128i *)(vCoeffs + 1)); + verf[2] = *((const __m128i *)(vCoeffs + 2)); + verf[3] = *((const __m128i *)(vCoeffs + 3)); + verf[4] = *((const __m128i *)(vCoeffs + 4)); + verf[5] = *((const __m128i *)(vCoeffs + 5)); + + horf[0] = *((const __m128i *)(hCoeffs)); + horf[1] = *((const __m128i *)(hCoeffs + 1)); + + count = 0; + + // here tapsNum is filter size + src -= (tapsNum >> 1) - 1; + src_ptr = src; + if (w > WIDTH_BOUND && h > HEIGHT_BOUND) { + // 8-pixels parallel + block_height = h >> 3; + block_residu = h & 7; + + do { + for (col = 0; col < w; col += 8) { + for (i = 0; i < 8; ++i) { + filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum, + temp + (i * 8)); + src_ptr += 1; + } + transpose_8x8(temp, 8, dst + col, dst_stride); + } + count++; + src_ptr = src + count * src_stride * 8; + dst += dst_stride * 8; + } while (count < block_height); + + for (i = 0; i < block_residu; ++i) { + filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst); + src_ptr += src_stride; + dst += dst_stride; + } + } else { + if (w > 2) { + // 4-pixels parallel + block_height = h >> 2; + block_residu = h & 3; + + do { + for (col = 0; col < w; col += 4) { + for (i = 0; i < 4; ++i) { + filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum, + temp + (i * 4)); + src_ptr += 1; + } + transpose_4x4(temp, 4, dst + col, dst_stride); + } + count++; + src_ptr = src + count * src_stride * 4; + dst += dst_stride * 4; + } while (count < block_height); + + for (i = 0; i < block_residu; ++i) { + filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst); + src_ptr += src_stride; + dst += dst_stride; + } + } else { + for (i = 0; i < h; i++) { + filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst); + src_ptr += src_stride; + dst += dst_stride; + } + } + } +} + +// Vertical convolution filtering +static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) { + __m128i u = _mm_packus_epi16(*x, *x); + _mm_storel_epi64((__m128i *)dst, u); +} + +static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) { + __m128i y = accumulate_store(x, dst); + _mm_storel_epi64((__m128i *)dst, y); +} + +static store_pixel_t store8pixelTab[2] = { store_8_pixel_only, + accumulate_store_8_pixel }; + +static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride, + int tapsNum, __m128i *f) { + __m128i s[12]; + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i zero = _mm_setzero_si128(); + __m128i min_x2x3, max_x2x3, sum; + int i = 0; + int r = 0; + + if (10 == tapsNum) { + i += 1; + s[0] = zero; + } + while (i < 12) { + s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); + i += 1; + r += 1; + } + + s[0] = _mm_unpacklo_epi8(s[0], s[1]); + s[2] = _mm_unpacklo_epi8(s[2], s[3]); + s[4] = _mm_unpacklo_epi8(s[4], s[5]); + s[6] = _mm_unpacklo_epi8(s[6], s[7]); + s[8] = _mm_unpacklo_epi8(s[8], s[9]); + s[10] = _mm_unpacklo_epi8(s[10], s[11]); + + s[0] = _mm_maddubs_epi16(s[0], f[0]); + s[2] = _mm_maddubs_epi16(s[2], f[1]); + s[4] = _mm_maddubs_epi16(s[4], f[2]); + s[6] = _mm_maddubs_epi16(s[6], f[3]); + s[8] = _mm_maddubs_epi16(s[8], f[4]); + s[10] = _mm_maddubs_epi16(s[10], f[5]); + + min_x2x3 = _mm_min_epi16(s[4], s[6]); + max_x2x3 = _mm_max_epi16(s[4], s[6]); + sum = _mm_adds_epi16(s[0], s[2]); + sum = _mm_adds_epi16(sum, s[10]); + sum = 
_mm_adds_epi16(sum, s[8]); + + sum = _mm_adds_epi16(sum, min_x2x3); + sum = _mm_adds_epi16(sum, max_x2x3); + + sum = _mm_mulhrs_epi16(sum, k_256); + sum = _mm_packus_epi16(sum, sum); + sum = _mm_unpacklo_epi8(sum, zero); + return sum; +} + +static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride, + __m128i *f, int tapsNum, + store_pixel_t store_func, + uint8_t *dst) { + __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f); + store_func(&sum, dst); +} + +static void filter_vert_compute_small(const uint8_t *src, int src_stride, + __m128i *f, int tapsNum, + store_pixel_t store_func, int h, + uint8_t *dst, int dst_stride) { + int rowIndex = 0; + do { + filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func, + dst); + rowIndex++; + src += src_stride; + dst += dst_stride; + } while (rowIndex < h); +} + +static void filter_vert_compute_large(const uint8_t *src, int src_stride, + __m128i *f, int tapsNum, + store_pixel_t store_func, int w, int h, + uint8_t *dst, int dst_stride) { + int col; + int rowIndex = 0; + const uint8_t *src_ptr = src; + uint8_t *dst_ptr = dst; + + do { + for (col = 0; col < w; col += 8) { + filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum, + store_func, dst_ptr); + src_ptr += 8; + dst_ptr += 8; + } + rowIndex++; + src_ptr = src + rowIndex * src_stride; + dst_ptr = dst + rowIndex * dst_stride; + } while (rowIndex < h); +} + +void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams filter_params, + const int subpel_y_q4, int y_step_q4, + ConvolveParams *conv_params) { + __m128i verf[6]; + SubpelFilterCoeffs vCoeffs; + const uint8_t *src_ptr; + uint8_t *dst_ptr = dst; + store_pixel_t store2p = store2pixelTab[conv_params->ref]; + store_pixel_t store4p = store4pixelTab[conv_params->ref]; + store_pixel_t store8p = store8pixelTab[conv_params->ref]; + const int tapsNum = filter_params.taps; + + if (0 == subpel_y_q4 || 16 != y_step_q4) { + av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params, + subpel_y_q4, y_step_q4, conv_params); + return; + } + + vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1); + + if (!vCoeffs) { + av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params, + subpel_y_q4, y_step_q4, conv_params); + return; + } + + verf[0] = *((const __m128i *)(vCoeffs)); + verf[1] = *((const __m128i *)(vCoeffs + 1)); + verf[2] = *((const __m128i *)(vCoeffs + 2)); + verf[3] = *((const __m128i *)(vCoeffs + 3)); + verf[4] = *((const __m128i *)(vCoeffs + 4)); + verf[5] = *((const __m128i *)(vCoeffs + 5)); + + src -= src_stride * ((tapsNum >> 1) - 1); + src_ptr = src; + + if (w > 4) { + filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h, + dst_ptr, dst_stride); + } else if (4 == w) { + filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h, + dst_ptr, dst_stride); + } else if (2 == w) { + filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h, + dst_ptr, dst_stride); + } else { + assert(0); + } +} + +static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps, + int8_t (*simd_horiz_filter)[2][16]) { + int shift; + int offset = (12 - taps) / 2; + const int16_t *filter_row; + for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) { + int i; + filter_row = filter_ptr + shift * taps; + for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0; + + for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 
1][1][i] = 0; + + for (i = 0; i < taps; ++i) { + simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i]; + simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i]; + } + + for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0; + + for (i = offset + 2 + taps; i < 16; ++i) + simd_horiz_filter[shift - 1][1][i] = 0; + } +} + +static void init_simd_vert_filter(const int16_t *filter_ptr, int taps, + int8_t (*simd_vert_filter)[6][16]) { + int shift; + int offset = (12 - taps) / 2; + const int16_t *filter_row; + for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) { + int i; + filter_row = filter_ptr + shift * taps; + for (i = 0; i < 6; ++i) { + int j; + for (j = 0; j < 16; ++j) { + int c = i * 2 + (j % 2) - offset; + if (c >= 0 && c < taps) + simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c]; + else + simd_vert_filter[shift - 1][i][j] = 0; + } + } + } +} + +typedef struct SimdFilter { + InterpFilter interp_filter; + int8_t (*simd_horiz_filter)[2][16]; + int8_t (*simd_vert_filter)[6][16]; +} SimdFilter; + +#if CONFIG_DUAL_FILTER +#define MULTITAP_FILTER_NUM 1 +SimdFilter simd_filters[MULTITAP_FILTER_NUM] = { + { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0], + &sub_pel_filters_12sharp_ver_signal_dir[0] }, +}; +#endif + +#if USE_TEMPORALFILTER_12TAP +SimdFilter temporal_simd_filter = { + TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0], + &sub_pel_filters_temporalfilter_12_ver_signal_dir[0] +}; +#endif + +void av1_lowbd_convolve_init_ssse3(void) { +#if USE_TEMPORALFILTER_12TAP + { + InterpFilterParams filter_params = + av1_get_interp_filter_params(temporal_simd_filter.interp_filter); + int taps = filter_params.taps; + const int16_t *filter_ptr = filter_params.filter_ptr; + init_simd_horiz_filter(filter_ptr, taps, + temporal_simd_filter.simd_horiz_filter); + init_simd_vert_filter(filter_ptr, taps, + temporal_simd_filter.simd_vert_filter); + } +#endif +#if CONFIG_DUAL_FILTER + { + int i; + for (i = 0; i < MULTITAP_FILTER_NUM; ++i) { + InterpFilter interp_filter = simd_filters[i].interp_filter; + InterpFilterParams filter_params = + av1_get_interp_filter_params(interp_filter); + int taps = filter_params.taps; + const int16_t *filter_ptr = filter_params.filter_ptr; + init_simd_horiz_filter(filter_ptr, taps, + simd_filters[i].simd_horiz_filter); + init_simd_vert_filter(filter_ptr, taps, simd_filters[i].simd_vert_filter); + } + } +#endif + return; +} diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 000000000..d04b667f1 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,839 @@ +#include "av1/common/x86/av1_txfm1d_sse4.h" + +void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t *cos_bit, const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[32]; + __m128i buf1[32]; + int col_num = txfm_size / num_per_128; + int bit; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // stage 0; + int32_t stage_idx = 0; + int j; + for (j = 0; j < 32; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[31]); + buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[30]); + buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[2], 
buf0[29]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[28]); + buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]); + buf1[4] = _mm_add_epi32(buf0[4], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]); + buf1[5] = _mm_add_epi32(buf0[5], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]); + buf1[6] = _mm_add_epi32(buf0[6], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[24]); + buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]); + buf1[8] = _mm_add_epi32(buf0[8], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]); + buf1[9] = _mm_add_epi32(buf0[9], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]); + buf1[10] = _mm_add_epi32(buf0[10], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]); + buf1[11] = _mm_add_epi32(buf0[11], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]); + buf1[12] = _mm_add_epi32(buf0[12], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]); + buf1[13] = _mm_add_epi32(buf0[13], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]); + buf1[14] = _mm_add_epi32(buf0[14], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]); + buf1[15] = _mm_add_epi32(buf0[15], buf0[16]); + buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]); + + // stage 2 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); 
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], + buf0[6], bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], + buf1[1], bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], + buf1[3], bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = buf1[0]; + buf0[1] = 
buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], + buf0[6], bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], + buf1[15], bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], bit); 
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], bit); + + // stage 9 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = buf0[16]; + buf1[2] = buf0[8]; + buf1[3] = buf0[24]; + buf1[4] = buf0[4]; + buf1[5] = buf0[20]; + buf1[6] = buf0[12]; + buf1[7] = buf0[28]; + buf1[8] = buf0[2]; + buf1[9] = buf0[18]; + buf1[10] = buf0[10]; + buf1[11] = buf0[26]; + buf1[12] = buf0[6]; + buf1[13] = buf0[22]; + buf1[14] = buf0[14]; + buf1[15] = buf0[30]; + buf1[16] = buf0[1]; + buf1[17] = buf0[17]; + buf1[18] = buf0[9]; + buf1[19] = buf0[25]; + buf1[20] = buf0[5]; + buf1[21] = buf0[21]; + buf1[22] = buf0[13]; + buf1[23] = buf0[29]; + buf1[24] = buf0[3]; + buf1[25] = buf0[19]; + buf1[26] = buf0[11]; + buf1[27] = buf0[27]; + buf1[28] = buf0[7]; + buf1[29] = buf0[23]; + buf1[30] = buf0[15]; + buf1[31] = buf0[31]; + + for (j = 0; j < 32; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t *cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; + int bit; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // stage 0; + int32_t stage_idx = 0; + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], bit); + + // stage 3 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], bit); + + // stage 5 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); + + for (j = 0; j < 4; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t *cos_bit, const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[32]; + __m128i buf1[32]; + int col_num = txfm_size / num_per_128; + int bit; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // stage 0; + int32_t stage_idx = 0; + int j; + for (j = 0; j < 32; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = buf0[31]; + buf1[1] = buf0[0]; + buf1[2] = buf0[29]; + buf1[3] = buf0[2]; + buf1[4] = buf0[27]; + buf1[5] = buf0[4]; + buf1[6] = buf0[25]; + buf1[7] = buf0[6]; + buf1[8] = buf0[23]; + buf1[9] = buf0[8]; + buf1[10] = buf0[21]; + buf1[11] = buf0[10]; + buf1[12] = buf0[19]; + buf1[13] = buf0[12]; + buf1[14] = buf0[17]; + buf1[15] = buf0[14]; + buf1[16] = buf0[15]; + buf1[17] = buf0[16]; + buf1[18] = buf0[13]; + buf1[19] = buf0[18]; + buf1[20] 
= buf0[11]; + buf1[21] = buf0[20]; + buf1[22] = buf0[9]; + buf1[23] = buf0[22]; + buf1[24] = buf0[7]; + buf1[25] = buf0[24]; + buf1[26] = buf0[5]; + buf1[27] = buf0[26]; + buf1[28] = buf0[3]; + buf1[29] = buf0[28]; + buf1[30] = buf0[1]; + buf1[31] = buf0[30]; + + // stage 2 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], + bit); + btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], + bit); + btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5], + bit); + btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], + buf0[7], bit); + btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], + buf0[9], bit); + btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10], + buf0[11], bit); + btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14], + buf0[15], bit); + btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16], + buf0[17], bit); + btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18], + buf0[19], bit); + btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20], + buf0[21], bit); + btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22], + buf0[23], bit); + btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24], + buf0[25], bit); + btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26], + buf0[27], bit); + btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28], + buf0[29], bit); + btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30], + buf0[31], bit); + + // stage 3 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[16]); + buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]); + buf1[4] = _mm_add_epi32(buf0[4], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]); + buf1[5] = _mm_add_epi32(buf0[5], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]); + buf1[6] = _mm_add_epi32(buf0[6], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]); + buf1[8] = _mm_add_epi32(buf0[8], buf0[24]); + buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]); + buf1[9] = _mm_add_epi32(buf0[9], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]); + buf1[10] = _mm_add_epi32(buf0[10], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]); + buf1[11] = _mm_add_epi32(buf0[11], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]); + buf1[12] = _mm_add_epi32(buf0[12], buf0[28]); + buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]); + buf1[13] = _mm_add_epi32(buf0[13], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]); + buf1[14] = _mm_add_epi32(buf0[14], buf0[30]); + buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]); + buf1[15] = _mm_add_epi32(buf0[15], buf0[31]); + buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]); + + // stage 4 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] 
= buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16], + buf0[17], bit); + btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18], + buf0[19], bit); + btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20], + buf0[21], bit); + btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22], + buf0[23], bit); + btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24], + buf0[25], bit); + btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30], + buf0[31], bit); + + // stage 5 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[8]); + buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[9]); + buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[10]); + buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[11]); + buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]); + buf1[4] = _mm_add_epi32(buf0[4], buf0[12]); + buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]); + buf1[5] = _mm_add_epi32(buf0[5], buf0[13]); + buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]); + buf1[6] = _mm_add_epi32(buf0[6], buf0[14]); + buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[15]); + buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]); + buf1[16] = _mm_add_epi32(buf0[16], buf0[24]); + buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[28]); + buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]); + buf1[21] = _mm_add_epi32(buf0[21], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[30]); + buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[31]); + buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]); + + // stage 6 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], + bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], + buf0[11], bit); + btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], + buf0[15], bit); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + buf0[21] = buf1[21]; + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24], + buf0[25], bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[56], cospi[8], 
buf1[28], buf1[29], buf0[28], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30], + buf0[31], bit); + + // stage 7 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]); + buf1[8] = _mm_add_epi32(buf0[8], buf0[12]); + buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]); + buf1[9] = _mm_add_epi32(buf0[9], buf0[13]); + buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]); + buf1[10] = _mm_add_epi32(buf0[10], buf0[14]); + buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]); + buf1[11] = _mm_add_epi32(buf0[11], buf0[15]); + buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]); + buf1[16] = _mm_add_epi32(buf0[16], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[28]); + buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]); + buf1[26] = _mm_add_epi32(buf0[26], buf0[30]); + buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[31]); + buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]); + + // stage 8 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], + buf0[5], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], + buf0[7], bit); + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], + buf0[15], bit); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20], + buf0[21], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22], + buf0[23], bit); + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[26] = buf1[26]; + buf0[27] = buf1[27]; + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30], + buf0[31], bit); + + // stage 9 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + buf1[4] = _mm_add_epi32(buf0[4], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]); + buf1[5] = _mm_add_epi32(buf0[5], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]); + buf1[8] = _mm_add_epi32(buf0[8], buf0[10]); + buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]); + buf1[9] = _mm_add_epi32(buf0[9], buf0[11]); + buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]); + buf1[12] = _mm_add_epi32(buf0[12], buf0[14]); + buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]); + buf1[13] = 
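+ /* The stages without cospi weights (3, 5, 7, 9 here) are plain add/sub
+    butterflies with no rounding; per lane each pair (a, b) becomes
+    (a + b, a - b):
+
+      static void butterfly(int32_t *a, int32_t *b) {
+        const int32_t t = *a;
+        *a = t + *b;  // sum lane
+        *b = t - *b;  // difference lane
+      }
+
+    The output stage (stage 11 below) only permutes lanes and negates
+    alternate outputs; the negation is spelled _mm_sub_epi32(zero, x),
+    i.e. 0 - x. */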
_mm_add_epi32(buf0[13], buf0[15]); + buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]); + buf1[16] = _mm_add_epi32(buf0[16], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]); + buf1[21] = _mm_add_epi32(buf0[21], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[30]); + buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[31]); + buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]); + + // stage 10 + stage_idx++; + bit = cos_bit[stage_idx]; + cospi = cospi_arr[bit - cos_bit_min]; + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], bit); + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], + buf0[7], bit); + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], + buf0[11], bit); + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], + buf0[15], bit); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18], + buf0[19], bit); + buf0[20] = buf1[20]; + buf0[21] = buf1[21]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22], + buf0[23], bit); + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26], + buf0[27], bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30], + buf0[31], bit); + + // stage 11 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]); + buf1[2] = buf0[24]; + buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]); + buf1[4] = buf0[12]; + buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]); + buf1[6] = buf0[20]; + buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]); + buf1[8] = buf0[6]; + buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]); + buf1[10] = buf0[30]; + buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]); + buf1[12] = buf0[10]; + buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]); + buf1[14] = buf0[18]; + buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); + buf1[16] = buf0[3]; + buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]); + buf1[18] = buf0[27]; + buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]); + buf1[20] = buf0[15]; + buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]); + buf1[22] = buf0[23]; + buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]); + buf1[24] = buf0[5]; + buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]); + buf1[26] = buf0[29]; + buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]); + buf1[28] = buf0[9]; + buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]); + buf1[30] = buf0[17]; + buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); + + for (j = 0; j < 32; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 
000000000..78c261374 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm1d_sse4.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output, + const int8_t *cos_bit, const int8_t *stage_range); + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break; + case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, const TXFM_2D_CFG *cfg, + int32_t *txfm_buf) { + const int txfm_size = cfg->txfm_size; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t *cos_bit_col = cfg->cos_bit_col; + const int8_t *cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32(txfm_size, buf_128, out_128); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, int tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); +} + +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, int tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); +} diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c new file mode 100644 
index 000000000..cf6249bdc
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
+
+#if CONFIG_DUAL_FILTER
+DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
+#endif
+
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]);
+#endif
+
+typedef int16_t (*HbdSubpelFilterCoeffs)[8];
+
+typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd);
+
+static INLINE HbdSubpelFilterCoeffs
+hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
+#if CONFIG_DUAL_FILTER
+ if (p.interp_filter == MULTITAP_SHARP) {
+ return &subpel_filters_sharp[index][0];
+ }
+#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &subpel_temporalfilter[index][0];
+ }
+#endif
+ (void)p;
+ (void)index;
+ return NULL;
+}
+
+static void init_simd_filter(const int16_t *filter_ptr, int taps,
+ int16_t (*simd_filter)[6][8]) {
+ int shift;
+ int offset = (12 - taps) / 2;
+ for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
+ const int16_t *filter_row = filter_ptr + shift * taps;
+ int i, j;
+ for (i = 0; i < 12; ++i) {
+ for (j = 0; j < 4; ++j) {
+ int r = i / 2;
+ int c = j * 2 + (i % 2);
+ if (i - offset >= 0 && i - offset < taps)
+ simd_filter[shift - 1][r][c] = filter_row[i - offset];
+ else
+ simd_filter[shift - 1][r][c] = 0;
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_init_sse4_1(void) {
+#if USE_TEMPORALFILTER_12TAP
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(TEMPORALFILTER_12TAP);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_filter(filter_ptr, taps, subpel_temporalfilter);
+ }
+#endif
+#if CONFIG_DUAL_FILTER
+ {
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(MULTITAP_SHARP);
+ int taps = filter_params.taps;
+ const int16_t *filter_ptr = filter_params.filter_ptr;
+ init_simd_filter(filter_ptr, taps, subpel_filters_sharp);
+ }
+#endif
+}
+
+// pixelsNum 0: write all 4 pixels
+// 1/2/3: residual pixels 1/2/3
+static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
+ int dst_stride) {
+ if (2 == width) {
+ if (0 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
+ } else if (1 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ } else if (2 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ } else if (3 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ }
+ } else {
+ if (0 == pixelsNum) {
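+ /* pixelsNum is how block heights that are not multiples of 4 are handled:
+    the convolve kernels always produce a transposed 4-row tile, and
+    writePixel flushes only the rows that actually exist. For example,
+    h = 7 gives one full 4-row pass plus a residual pass with
+    pixelsNum = 7 & 3 = 3. */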
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
+ } else if (1 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ } else if (2 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ } else if (3 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ }
+ }
+}
+
+// Clip 16-bit pixels to the [0, (1 << bd) - 1] range, bd being 10 or 12
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+ int i;
+
+ for (i = 0; i < numVecs; i++) {
+ mask = _mm_cmpgt_epi16(p[i], max);
+ clamped = _mm_andnot_si128(mask, p[i]);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ p[i] = _mm_and_si128(clamped, mask);
+ }
+}
+
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+ __m128i v0, v1;
+ __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+
+ u[0] = _mm_loadu_si128((__m128i const *)src);
+ u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+ u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ u[0] = _mm_add_epi32(u[0], rnd);
+ u[1] = _mm_add_epi32(u[1], rnd);
+ u[2] = _mm_add_epi32(u[2], rnd);
+ u[3] = _mm_add_epi32(u[3], rnd);
+
+ u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+ u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
+ u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
+ u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
+
+ u[0] = _mm_packus_epi32(u[0], u[1]);
+ u[1] = _mm_packus_epi32(u[2], u[3]);
+
+ highbd_clip(u, 2, bd);
+
+ v0 = _mm_unpacklo_epi16(u[0], u[1]);
+ v1 = _mm_unpackhi_epi16(u[0], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(v0, v1);
+ u[2] = _mm_unpackhi_epi16(v0, v1);
+
+ u[1] = _mm_srli_si128(u[0], 8);
+ u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// pixelsNum = 0 : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : residual 1/2/3 rows of pixels will be saved.
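+ /* Both save variants below run transClipPixel first; in scalar terms each
+    32-bit accumulator v becomes
+
+      p = clamp((v + (1 << (FILTER_BITS - 1))) >> FILTER_BITS,
+                0, (1 << bd) - 1);
+
+    and the accumulating variant then averages with the pixel already in the
+    destination, dst = (dst + p + 1) >> 1, with rounding. */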
+void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride, + uint16_t *dst, int dst_stride, int bd) { + __m128i u[4]; + transClipPixel(src, src_stride, u, bd); + writePixel(u, width, pixelsNum, dst, dst_stride); +} + +void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src, + int src_stride, uint16_t *dst, int dst_stride, + int bd) { + __m128i u[4], v[4]; + const __m128i ones = _mm_set1_epi16(1); + + transClipPixel(src, src_stride, u, bd); + + v[0] = _mm_loadl_epi64((__m128i const *)dst); + v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride)); + v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); + v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); + + u[0] = _mm_add_epi16(u[0], v[0]); + u[1] = _mm_add_epi16(u[1], v[1]); + u[2] = _mm_add_epi16(u[2], v[2]); + u[3] = _mm_add_epi16(u[3], v[3]); + + u[0] = _mm_add_epi16(u[0], ones); + u[1] = _mm_add_epi16(u[1], ones); + u[2] = _mm_add_epi16(u[2], ones); + u[3] = _mm_add_epi16(u[3], ones); + + u[0] = _mm_srai_epi16(u[0], 1); + u[1] = _mm_srai_epi16(u[1], 1); + u[2] = _mm_srai_epi16(u[2], 1); + u[3] = _mm_srai_epi16(u[3], 1); + + writePixel(u, width, pixelsNum, dst, dst_stride); +} + +static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 }; + +static INLINE void transpose_pair(__m128i *in, __m128i *out) { + __m128i x0, x1; + + x0 = _mm_unpacklo_epi32(in[0], in[1]); + x1 = _mm_unpacklo_epi32(in[2], in[3]); + + out[0] = _mm_unpacklo_epi64(x0, x1); + out[1] = _mm_unpackhi_epi64(x0, x1); + + x0 = _mm_unpackhi_epi32(in[0], in[1]); + x1 = _mm_unpackhi_epi32(in[2], in[3]); + + out[2] = _mm_unpacklo_epi64(x0, x1); + out[3] = _mm_unpackhi_epi64(x0, x1); + + x0 = _mm_unpacklo_epi32(in[4], in[5]); + x1 = _mm_unpacklo_epi32(in[6], in[7]); + + out[4] = _mm_unpacklo_epi64(x0, x1); + out[5] = _mm_unpackhi_epi64(x0, x1); +} + +static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f, + int tapsNum, uint32_t *buf) { + __m128i u[8], v[6]; + + if (tapsNum == 10) { + src -= 1; + } + + u[0] = _mm_loadu_si128((__m128i const *)src); + u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); + u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + u[4] = _mm_loadu_si128((__m128i const *)(src + 8)); + u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8)); + u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8)); + u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8)); + + transpose_pair(u, v); + + u[0] = _mm_madd_epi16(v[0], f[0]); + u[1] = _mm_madd_epi16(v[1], f[1]); + u[2] = _mm_madd_epi16(v[2], f[2]); + u[3] = _mm_madd_epi16(v[3], f[3]); + u[4] = _mm_madd_epi16(v[4], f[4]); + u[5] = _mm_madd_epi16(v[5], f[5]); + + u[6] = _mm_min_epi32(u[2], u[3]); + u[7] = _mm_max_epi32(u[2], u[3]); + + u[0] = _mm_add_epi32(u[0], u[1]); + u[0] = _mm_add_epi32(u[0], u[5]); + u[0] = _mm_add_epi32(u[0], u[4]); + u[0] = _mm_add_epi32(u[0], u[6]); + u[0] = _mm_add_epi32(u[0], u[7]); + + _mm_storeu_si128((__m128i *)buf, u[0]); +} + +void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, + const InterpFilterParams filter_params, + const int subpel_x_q4, int x_step_q4, + int avg, int bd) { + DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]); + __m128i verf[6]; + HbdSubpelFilterCoeffs vCoeffs; + const uint16_t *srcPtr; + const int tapsNum = filter_params.taps; + int i, col, count, blkResidu, blkHeight; + 
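+ /* highbd_filter_horiz above consumes the six interleaved-pair registers
+    built by init_simd_filter: one _mm_madd_epi16 per register sums taps
+    (2k, 2k+1) for four pixels at once. A scalar sketch of the dot product
+    the madd chain computes (hypothetical helper, for illustration only):
+
+      static int32_t dot12(const int16_t f[12], const int16_t s[12]) {
+        int32_t sum = 0;
+        for (int k = 0; k < 6; ++k)  // six madd-style pair sums
+          sum += f[2 * k] * s[2 * k] + f[2 * k + 1] * s[2 * k + 1];
+        return sum;
+      }
+ */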
TransposeSave transSave = transSaveTab[avg]; + (void)x_step_q4; + + if (0 == subpel_x_q4 || 16 != x_step_q4) { + av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, + filter_params, subpel_x_q4, x_step_q4, avg, bd); + return; + } + + vCoeffs = + hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1); + if (!vCoeffs) { + av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, + filter_params, subpel_x_q4, x_step_q4, avg, bd); + return; + } + + verf[0] = *((const __m128i *)(vCoeffs)); + verf[1] = *((const __m128i *)(vCoeffs + 1)); + verf[2] = *((const __m128i *)(vCoeffs + 2)); + verf[3] = *((const __m128i *)(vCoeffs + 3)); + verf[4] = *((const __m128i *)(vCoeffs + 4)); + verf[5] = *((const __m128i *)(vCoeffs + 5)); + + src -= (tapsNum >> 1) - 1; + srcPtr = src; + + count = 0; + blkHeight = h >> 2; + blkResidu = h & 3; + + while (blkHeight != 0) { + for (col = 0; col < w; col += 4) { + for (i = 0; i < 4; ++i) { + highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); + srcPtr += 1; + } + transSave(w, 0, temp, 4, dst + col, dst_stride, bd); + } + count++; + srcPtr = src + count * src_stride * 4; + dst += dst_stride * 4; + blkHeight--; + } + + if (blkResidu == 0) return; + + for (col = 0; col < w; col += 4) { + for (i = 0; i < 4; ++i) { + highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); + srcPtr += 1; + } + transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd); + } +} + +// Vertical convolutional filter + +typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst); + +static void highbdRndingPacks(__m128i *u) { + __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); + u[0] = _mm_add_epi32(u[0], rnd); + u[0] = _mm_srai_epi32(u[0], FILTER_BITS); + u[0] = _mm_packus_epi32(u[0], u[0]); +} + +static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) { + highbdRndingPacks(u); + highbd_clip(u, 1, bd); + *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]); +} + +static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) { + __m128i v = _mm_loadl_epi64((__m128i const *)dst); + const __m128i ones = _mm_set1_epi16(1); + + highbdRndingPacks(u); + highbd_clip(u, 1, bd); + + v = _mm_add_epi16(v, u[0]); + v = _mm_add_epi16(v, ones); + v = _mm_srai_epi16(v, 1); + *(uint32_t *)dst = _mm_cvtsi128_si32(v); +} + +WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum }; + +static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) { + highbdRndingPacks(u); + highbd_clip(u, 1, bd); + _mm_storel_epi64((__m128i *)dst, u[0]); +} + +static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { + __m128i v = _mm_loadl_epi64((__m128i const *)dst); + const __m128i ones = _mm_set1_epi16(1); + + highbdRndingPacks(u); + highbd_clip(u, 1, bd); + + v = _mm_add_epi16(v, u[0]); + v = _mm_add_epi16(v, ones); + v = _mm_srai_epi16(v, 1); + _mm_storel_epi64((__m128i *)dst, v); +} + +WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; + +static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride, + const __m128i *f, int taps, + uint16_t *dst, WritePixels saveFunc, + int bd) { + __m128i s[12]; + __m128i zero = _mm_setzero_si128(); + int i = 0; + int r = 0; + + // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case + if (10 == taps) { + i += 1; + s[0] = zero; + } + while (i < 12) { + s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); + i += 1; + r += 1; + } + + s[0] = _mm_unpacklo_epi16(s[0], s[1]); + s[2] = _mm_unpacklo_epi16(s[2], s[3]); + s[4] = 
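+ /* For 10-tap filters the 12-row window above gains one zero row at the top
+    (s[0] = zero, loads shifted down by one) and init_simd_filter has already
+    zero-padded the coefficients, so the same six pair-wise madd products
+    below serve both the 12- and 10-tap cases; padded rows multiply zero taps
+    and drop out of the sum. */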
_mm_unpacklo_epi16(s[4], s[5]);
+ s[6] = _mm_unpacklo_epi16(s[6], s[7]);
+ s[8] = _mm_unpacklo_epi16(s[8], s[9]);
+ s[10] = _mm_unpacklo_epi16(s[10], s[11]);
+
+ s[0] = _mm_madd_epi16(s[0], f[0]);
+ s[2] = _mm_madd_epi16(s[2], f[1]);
+ s[4] = _mm_madd_epi16(s[4], f[2]);
+ s[6] = _mm_madd_epi16(s[6], f[3]);
+ s[8] = _mm_madd_epi16(s[8], f[4]);
+ s[10] = _mm_madd_epi16(s[10], f[5]);
+
+ s[1] = _mm_min_epi32(s[4], s[6]);
+ s[3] = _mm_max_epi32(s[4], s[6]);
+
+ s[0] = _mm_add_epi32(s[0], s[2]);
+ s[0] = _mm_add_epi32(s[0], s[10]);
+ s[0] = _mm_add_epi32(s[0], s[8]);
+ s[0] = _mm_add_epi32(s[0], s[1]);
+ s[0] = _mm_add_epi32(s[0], s[3]);
+
+ saveFunc(s, bd, dst);
+}
+
+static void highbd_filter_vert_compute_large(const uint16_t *src,
+ int src_stride, const __m128i *f,
+ int taps, int w, int h,
+ uint16_t *dst, int dst_stride,
+ int avg, int bd) {
+ int col;
+ int rowIndex = 0;
+ const uint16_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
+ const int step = 4;
+ WritePixels write4pixels = write4pixelsTab[avg];
+
+ do {
+ for (col = 0; col < w; col += step) {
+ filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr,
+ write4pixels, bd);
+ src_ptr += step;
+ dst_ptr += step;
+ }
+ rowIndex++;
+ src_ptr = src + rowIndex * src_stride;
+ dst_ptr = dst + rowIndex * dst_stride;
+ } while (rowIndex < h);
+}
+
+static void highbd_filter_vert_compute_small(const uint16_t *src,
+ int src_stride, const __m128i *f,
+ int taps, int w, int h,
+ uint16_t *dst, int dst_stride,
+ int avg, int bd) {
+ int rowIndex = 0;
+ WritePixels write2pixels = write2pixelsTab[avg];
+ (void)w;
+
+ do {
+ filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd);
+ rowIndex++;
+ src += src_stride;
+ dst += dst_stride;
+ } while (rowIndex < h);
+}
+
+void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h,
+ const InterpFilterParams filter_params,
+ const int subpel_y_q4, int y_step_q4,
+ int avg, int bd) {
+ __m128i verf[6];
+ HbdSubpelFilterCoeffs vCoeffs;
+ const int tapsNum = filter_params.taps;
+
+ if (0 == subpel_y_q4 || 16 != y_step_q4) {
+ av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4, avg, bd);
+ return;
+ }
+
+ vCoeffs =
+ hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
+ if (!vCoeffs) {
+ av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params, subpel_y_q4, y_step_q4, avg, bd);
+ return;
+ }
+
+ verf[0] = *((const __m128i *)(vCoeffs));
+ verf[1] = *((const __m128i *)(vCoeffs + 1));
+ verf[2] = *((const __m128i *)(vCoeffs + 2));
+ verf[3] = *((const __m128i *)(vCoeffs + 3));
+ verf[4] = *((const __m128i *)(vCoeffs + 4));
+ verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+ src -= src_stride * ((tapsNum >> 1) - 1);
+
+ if (w > 2) {
+ highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst,
+ dst_stride, avg, bd);
+ } else {
+ highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst,
+ dst_stride, avg, bd);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
new file mode 100644
index 000000000..af7afb7ee
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+#ifndef AV1_TXMF1D_SSE2_H_
+#define AV1_TXMF1D_SSE2_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t
*cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented by a grid of 4x4 blocks,
+// and each 4x4 block can be represented by 4 vertical __m128i registers.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
+
+static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
+ __m128i tmp, round;
+ round = _mm_set1_epi32(1 << (bit - 1));
+ tmp = _mm_add_epi32(vec, round);
+ return _mm_srai_epi32(tmp,
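+ /* Per lane this computes (x + (1 << (bit - 1))) >> bit, a rounding right
+    shift: with bit = 2, 13 -> (13 + 2) >> 2 = 3 and 14 -> (14 + 2) >> 2 = 4.
+    The array variant below treats a negative `bit` as a left shift by -bit,
+    which needs no rounding. */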
bit); +} + +static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output, + const int size, const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = round_shift_32_sse4_1(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm_slli_epi32(input[i], -bit); + } + } +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ + ww0 = _mm_set1_epi32(w0); \ + ww1 = _mm_set1_epi32(w1); \ + in0_w0 = _mm_mullo_epi32(in0, ww0); \ + in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = round_shift_32_sse4_1(out0, bit); \ + in0_w1 = _mm_mullo_epi32(in0, ww1); \ + in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = round_shift_32_sse4_1(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ + ww0 = _mm_set1_epi32(w0); \ + ww1 = _mm_set1_epi32(w1); \ + in0_w0 = _mm_mullo_epi32(in0, ww0); \ + in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = round_shift_32_sse4_1(out0, bit); \ + in0_w1 = _mm_mullo_epi32(in0, ww1); \ + in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in1_w0, in0_w1); \ + out1 = round_shift_32_sse4_1(out1, bit); \ + } while (0) + +#ifdef __cplusplus +} +#endif + +#endif // AV1_TXMF1D_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c new file mode 100644 index 000000000..4f77da446 --- /dev/null +++ b/third_party/aom/av1/common/x86/filterintra_sse4.c @@ -0,0 +1,898 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#if USE_3TAP_INTRA_FILTER
+void filterintra_sse4_3tap_dummy_func(void);
+void filterintra_sse4_3tap_dummy_func(void) {}
+#else
+
+static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+}
+
+static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;
+ sum_value >>= 3;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;
+ sum_value >>= 4;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+
+ u0 = _mm_unpackhi_epi8(a, zero);
+ u1 = _mm_unpackhi_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(sum[0], u0);
+ sum[0] = _mm_add_epi16(sum[0], u1);
+}
+
+static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 16;
+ sum_value >>= 5;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector[0]);
+ AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);
+ sum_vector[0]
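+ /* For every block size the mean is the integer average of the 2 * bs
+    reference pixels (bs above + bs left):
+
+      mean = (sum + bs) >> (log2(bs) + 1)
+
+    e.g. for bs = 8 there are 16 references and mean = (sum + 8) >> 4; the
+    hadd/srli sequences above are just horizontal reductions of the
+    lane-wise partial sums. */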
= _mm_add_epi16(sum_vector[0], u); + + sum_value = _mm_extract_epi16(sum_vector[0], 0); + sum_value += 32; + sum_value >>= 6; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +// Note: +// params[4] : mean value, 4 int32_t repetition +// +static INLINE int CalcRefPixelsMeanValue(const uint8_t *above, + const uint8_t *left, int bs, + __m128i *params) { + int meanValue = 0; + switch (bs) { + case 4: meanValue = GetMeanValue4x4(above, left, params); break; + case 8: meanValue = GetMeanValue8x8(above, left, params); break; + case 16: meanValue = GetMeanValue16x16(above, left, params); break; + case 32: meanValue = GetMeanValue32x32(above, left, params); break; + default: assert(0); + } + return meanValue; +} + +// Note: +// params[0-3] : 4-tap filter coefficients (int32_t per coefficient) +// +static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) { + const TX_SIZE tx_size = + (bs == 32) ? TX_32X32 + : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4))); + // c0 + params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0], + av1_filter_intra_taps_4[tx_size][mode][0], + av1_filter_intra_taps_4[tx_size][mode][0], + av1_filter_intra_taps_4[tx_size][mode][0]); + // c1 + params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1], + av1_filter_intra_taps_4[tx_size][mode][1], + av1_filter_intra_taps_4[tx_size][mode][1], + av1_filter_intra_taps_4[tx_size][mode][1]); + // c2 + params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2], + av1_filter_intra_taps_4[tx_size][mode][2], + av1_filter_intra_taps_4[tx_size][mode][2], + av1_filter_intra_taps_4[tx_size][mode][2]); + // c3 + params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3], + av1_filter_intra_taps_4[tx_size][mode][3], + av1_filter_intra_taps_4[tx_size][mode][3], + av1_filter_intra_taps_4[tx_size][mode][3]); +} + +static const int maxBlkSize = 32; + +static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride)); + __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride)); + __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = _mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + *((int *)dst) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0); +} + +static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0, p1, p2, p3; + int r = 0; + + while (r < 8) { + p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + r += 1; + p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = 
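+ /* The pack pair below (_mm_packus_epi32, then _mm_packus_epi16) saturates
+    each mean-adjusted value first to [0, 65535] and then to [0, 255], so
+    the final 8-bit pixel clamp is performed implicitly by the pack
+    instructions rather than by explicit min/max operations. */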
_mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + _mm_storel_epi64((__m128i *)dst, p0); + dst += stride; + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)dst, p0); + dst += stride; + r += 1; + } +} + +static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0, p1, p2, p3; + int r = 0; + + while (r < 16) { + p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); + p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = _mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + _mm_storel_epi64((__m128i *)dst, p0); + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)(dst + 8), p0); + dst += stride; + r += 1; + } +} + +static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0, p1, p2, p3, p4, p5, p6, p7; + int r = 0; + + while (r < 32) { + p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); + p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); + + p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16)); + p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20)); + p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24)); + p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p4 = _mm_add_epi32(p4, mean[0]); + p5 = _mm_add_epi32(p5, mean[0]); + p6 = _mm_add_epi32(p6, mean[0]); + p7 = _mm_add_epi32(p7, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = _mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + p4 = _mm_packus_epi32(p4, p5); + p5 = _mm_packus_epi32(p6, p7); + p4 = _mm_packus_epi16(p4, p5); + + _mm_storel_epi64((__m128i *)dst, p0); + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)(dst + 8), p0); + + _mm_storel_epi64((__m128i *)(dst + 16), p4); + p4 = _mm_srli_si128(p4, 8); + _mm_storel_epi64((__m128i *)(dst + 24), p4); + + dst += stride; + r += 1; + } +} + +static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst, + ptrdiff_t stride) { + switch (bs) { + case 4: SavePred4x4(pred, mean, dst, stride); break; + case 8: SavePred8x8(pred, mean, dst, stride); break; + case 16: SavePred16x16(pred, mean, dst, stride); break; + case 32: SavePred32x32(pred, mean, dst, stride); break; + default: assert(0); + } +} + +typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred, + const int predStride); + +static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + 
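+ /* u0 holds, for four consecutive output pixels, the partial sums of the
+    taps that reference only the already-complete row above. The remaining
+    tap (local c0) multiplies the pixel just produced to the left, so that
+    term cannot be vectorized and is chained scalar-style below:
+
+      x = ROUND_POWER_OF_TWO_SIGNED(partial + c0 * x_left,
+                                    FILTER_INTRA_PREC_BITS);
+ */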
sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; + + sum = _mm_extract_epi32(u0, 1); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 2) = x; + + sum = _mm_extract_epi32(u0, 2); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 3) = x; + + sum = _mm_extract_epi32(u0, 3); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 4) = x; +} + +static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; + + sum = _mm_extract_epi32(u0, 1); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 2) = x; + + sum = _mm_extract_epi32(u0, 2); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 3) = x; +} + +static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; + + sum = _mm_extract_epi32(u0, 1); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 2) = x; +} + +static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; +} + +static ProducePixelsFunc prodPixelsFuncTab[4] = { + ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels +}; + +static void ProducePixels(int *pred, const __m128i *prm, int remain) { + __m128i p[3]; + const int predStride = (maxBlkSize << 1) + 1; + int index; + + p[0] = _mm_loadu_si128((const __m128i *)pred); + p[1] = _mm_loadu_si128((const __m128i *)(pred + 1)); + p[2] = _mm_loadu_si128((const __m128i *)(pred + 2)); + + if (remain <= 2) { + return; + } + if (remain > 5) { + index = 3; + } else { + index = remain - 3; + } + prodPixelsFuncTab[index](p, prm, pred, predStride); +} + +// Note: +// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c +// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1 +static void GeneratePrediction(const uint8_t *above, const uint8_t *left, + const int bs, const __m128i *prm, int meanValue, + uint8_t *dst, ptrdiff_t stride) { + int pred[33][65]; + int r, c, 
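+ /* pred[][] is a mean-removed working canvas: row 0 takes the 2 * bs + 1
+    above references starting at above[-1], column 0 takes the bs left
+    references, and rows are produced top to bottom over a shrinking width
+    (colBound = 2 * bs - r). The mean is added back when the bs x bs core is
+    saved, which keeps the filter arithmetic centered on zero. */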
colBound; + int remainings; + + for (r = 0; r < bs; ++r) { + pred[r + 1][0] = (int)left[r] - meanValue; + } + + above -= 1; + for (c = 0; c < 2 * bs + 1; ++c) { + pred[0][c] = (int)above[c] - meanValue; + } + + r = 0; + c = 0; + while (r < bs) { + colBound = (bs << 1) - r; + for (c = 0; c < colBound; c += 4) { + remainings = colBound - c + 1; + ProducePixels(&pred[r][c], prm, remainings); + } + r += 1; + } + + SavePrediction(&pred[1][1], &prm[4], bs, dst, stride); +} + +static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs, + __m128i *prm, uint8_t *dst, ptrdiff_t stride) { + int meanValue = 0; + meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]); + GeneratePrediction(above, left, bs, prm, meanValue, dst, stride); +} + +void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, DC_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, V_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, H_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D45_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D135_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D117_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D153_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D207_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D63_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, TM_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +// ============== High Bit Depth ============== +#if CONFIG_HIGHBITDEPTH +static INLINE int HighbdGetMeanValue4x4(const uint16_t *above, + const uint16_t *left, const int bd, + __m128i *params) { + const __m128i a = _mm_loadu_si128((const __m128i *)above); + const __m128i l = 
_mm_loadu_si128((const __m128i *)left); + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector, u; + uint16_t sum_value; + (void)bd; + + sum_vector = _mm_add_epi16(a, l); + + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values + u = _mm_srli_si128(sum_vector, 2); + sum_vector = _mm_add_epi16(sum_vector, u); + + sum_value = _mm_extract_epi16(sum_vector, 0); + sum_value += 4; + sum_value >>= 3; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +static INLINE int HighbdGetMeanValue8x8(const uint16_t *above, + const uint16_t *left, const int bd, + __m128i *params) { + const __m128i a = _mm_loadu_si128((const __m128i *)above); + const __m128i l = _mm_loadu_si128((const __m128i *)left); + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector, u; + uint16_t sum_value; + (void)bd; + + sum_vector = _mm_add_epi16(a, l); + + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values + + u = _mm_srli_si128(sum_vector, 2); + sum_vector = _mm_add_epi16(sum_vector, u); + + sum_value = _mm_extract_epi16(sum_vector, 0); + sum_value += 8; + sum_value >>= 4; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +// Note: +// Process 16 pixels above and left, 10-bit depth +// Add to the last 8 pixels sum +static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left, + __m128i *sum) { + __m128i a = _mm_loadu_si128((const __m128i *)above); + __m128i l = _mm_loadu_si128((const __m128i *)left); + sum[0] = _mm_add_epi16(a, l); + a = _mm_loadu_si128((const __m128i *)(above + 8)); + l = _mm_loadu_si128((const __m128i *)(left + 8)); + sum[0] = _mm_add_epi16(sum[0], a); + sum[0] = _mm_add_epi16(sum[0], l); +} + +// Note: +// Process 16 pixels above and left, 12-bit depth +// Add to the last 8 pixels sum +static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left, + __m128i *sum) { + __m128i a = _mm_loadu_si128((const __m128i *)above); + __m128i l = _mm_loadu_si128((const __m128i *)left); + const __m128i zero = _mm_setzero_si128(); + __m128i v0, v1; + + v0 = _mm_unpacklo_epi16(a, zero); + v1 = _mm_unpacklo_epi16(l, zero); + sum[0] = _mm_add_epi32(v0, v1); + + v0 = _mm_unpackhi_epi16(a, zero); + v1 = _mm_unpackhi_epi16(l, zero); + sum[0] = _mm_add_epi32(sum[0], v0); + sum[0] = _mm_add_epi32(sum[0], v1); + + a = _mm_loadu_si128((const __m128i *)(above + 8)); + l = _mm_loadu_si128((const __m128i *)(left + 8)); + + v0 = _mm_unpacklo_epi16(a, zero); + v1 = _mm_unpacklo_epi16(l, zero); + sum[0] = _mm_add_epi32(sum[0], v0); + sum[0] = _mm_add_epi32(sum[0], v1); + + v0 = _mm_unpackhi_epi16(a, zero); + v1 = _mm_unpackhi_epi16(l, zero); + sum[0] = _mm_add_epi32(sum[0], v0); + sum[0] = _mm_add_epi32(sum[0], v1); +} + +static INLINE int HighbdGetMeanValue16x16(const uint16_t *above, + const uint16_t *left, const int bd, + __m128i *params) { + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector, u; + uint32_t sum_value = 0; + + if (10 == bd) { + AddPixels10bit(above, left, &sum_vector); + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values + + u = _mm_srli_si128(sum_vector, 2); + sum_vector = _mm_add_epi16(sum_vector, u); + sum_value = _mm_extract_epi16(sum_vector, 0); + } else if (12 == bd) { + AddPixels12bit(above, left, &sum_vector); + + sum_vector = _mm_hadd_epi32(sum_vector, zero); + u = _mm_srli_si128(sum_vector, 4); + sum_vector = _mm_add_epi32(u, 
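+ /* Why the split on bit depth: a 16x16 block sums 32 reference pixels and a
+    32x32 block sums 64. With 10-bit input the worst case is
+    64 * 1023 = 65472, which still fits a 16-bit lane, so epi16 adds are
+    safe; with 12-bit input already 32 * 4095 = 131040 overflows 16 bits,
+    hence the widened epi32 accumulation in AddPixels12bit. */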
sum_vector); + sum_value = _mm_extract_epi32(sum_vector, 0); + } + + sum_value += 16; + sum_value >>= 5; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +static INLINE int HighbdGetMeanValue32x32(const uint16_t *above, + const uint16_t *left, const int bd, + __m128i *params) { + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector[2], u; + uint32_t sum_value = 0; + + if (10 == bd) { + AddPixels10bit(above, left, &sum_vector[0]); + AddPixels10bit(above + 16, left + 16, &sum_vector[1]); + + sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]); + sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values + sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values + + u = _mm_srli_si128(sum_vector[0], 2); + sum_vector[0] = _mm_add_epi16(sum_vector[0], u); + sum_value = _mm_extract_epi16(sum_vector[0], 0); + } else if (12 == bd) { + AddPixels12bit(above, left, &sum_vector[0]); + AddPixels12bit(above + 16, left + 16, &sum_vector[1]); + + sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]); + sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero); + u = _mm_srli_si128(sum_vector[0], 4); + sum_vector[0] = _mm_add_epi32(u, sum_vector[0]); + sum_value = _mm_extract_epi32(sum_vector[0], 0); + } + + sum_value += 32; + sum_value >>= 6; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +// Note: +// params[4] : mean value, 4 int32_t repetition +// +static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above, + const uint16_t *left, int bs, + const int bd, __m128i *params) { + int meanValue = 0; + switch (bs) { + case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break; + case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break; + case 16: + meanValue = HighbdGetMeanValue16x16(above, left, bd, params); + break; + case 32: + meanValue = HighbdGetMeanValue32x32(above, left, bd, params); + break; + default: assert(0); + } + return meanValue; +} + +// Note: +// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c +// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1 +static void HighbdGeneratePrediction(const uint16_t *above, + const uint16_t *left, const int bs, + const int bd, const __m128i *prm, + int meanValue, uint16_t *dst, + ptrdiff_t stride) { + int pred[33][65]; + int r, c, colBound; + int remainings; + int ipred; + + for (r = 0; r < bs; ++r) { + pred[r + 1][0] = (int)left[r] - meanValue; + } + + above -= 1; + for (c = 0; c < 2 * bs + 1; ++c) { + pred[0][c] = (int)above[c] - meanValue; + } + + r = 0; + c = 0; + while (r < bs) { + colBound = (bs << 1) - r; + for (c = 0; c < colBound; c += 4) { + remainings = colBound - c + 1; + ProducePixels(&pred[r][c], prm, remainings); + } + r += 1; + } + + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + ipred = pred[r + 1][c + 1] + meanValue; + dst[c] = clip_pixel_highbd(ipred, bd); + } + dst += stride; + } +} + +static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left, + int bs, const int bd, __m128i *prm, + uint16_t *dst, ptrdiff_t stride) { + int meanValue = 0; + meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]); + HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride); +} + +void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, DC_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, 
stride); +} + +void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, V_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, H_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, D45_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, D135_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, D117_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, D153_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, D207_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, D63_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} + +void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + __m128i prm[5]; + GetIntraFilterParams(bs, TM_PRED, &prm[0]); + HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride); +} +#endif // CONFIG_HIGHBITDEPTH + +#endif // USE_3TAP_INTRA_FILTER diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c new file mode 100644 index 000000000..d10f1ccc2 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -0,0 +1,557 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+
+// Note:
+// In total, 32x4 registers represent the 32x32 block coefficients.
+// For high bit depth, each coefficient is 4-byte.
+// Each __m256i register holds 8 coefficients.
+// So each "row" needs 4 registers, for a total of 32 rows.
+// Register layout:
+// v0, v1, v2, v3,
+// v4, v5, v6, v7,
+// ... ...
+// v124, v125, v126, v127
+
+static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
+  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m256i x0, x1;
+
+  u0 = _mm256_unpacklo_epi32(in[0], in[4]);
+  u1 = _mm256_unpackhi_epi32(in[0], in[4]);
+
+  u2 = _mm256_unpacklo_epi32(in[8], in[12]);
+  u3 = _mm256_unpackhi_epi32(in[8], in[12]);
+
+  u4 = _mm256_unpacklo_epi32(in[16], in[20]);
+  u5 = _mm256_unpackhi_epi32(in[16], in[20]);
+
+  u6 = _mm256_unpacklo_epi32(in[24], in[28]);
+  u7 = _mm256_unpackhi_epi32(in[24], in[28]);
+
+  x0 = _mm256_unpacklo_epi64(u0, u2);
+  x1 = _mm256_unpacklo_epi64(u4, u6);
+  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+  x0 = _mm256_unpackhi_epi64(u0, u2);
+  x1 = _mm256_unpackhi_epi64(u4, u6);
+  out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+  x0 = _mm256_unpacklo_epi64(u1, u3);
+  x1 = _mm256_unpacklo_epi64(u5, u7);
+  out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+  x0 = _mm256_unpackhi_epi64(u1, u3);
+  x1 = _mm256_unpackhi_epi64(u5, u7);
+  out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
+  out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
+static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
+  transpose_32x32_8x8(&in[0], &out[0]);
+  transpose_32x32_8x8(&in[1], &out[32]);
+  transpose_32x32_8x8(&in[32], &out[1]);
+  transpose_32x32_8x8(&in[33], &out[33]);
+}
+
+static void transpose_32x32(const __m256i *in, __m256i *out) {
+  transpose_32x32_16x16(&in[0], &out[0]);
+  transpose_32x32_16x16(&in[2], &out[64]);
+  transpose_32x32_16x16(&in[64], &out[2]);
+  transpose_32x32_16x16(&in[66], &out[66]);
+}
+
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
+  int i;
+  for (i = 0; i < 128; ++i) {
+    in[i] = _mm256_loadu_si256((const __m256i *)coeff);
+    coeff += 8;
+  }
+}
+
+static void round_shift_32x32(__m256i *in, int shift) {
+  __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
+  int i = 0;
+
+  while (i < 128) {
+    in[i] = _mm256_add_epi32(in[i], rnding);
+    in[i] = _mm256_srai_epi32(in[i], shift);
+    i++;
+  }
+}
+
+static __m256i highbd_clamp_epi32(__m256i x, int bd) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+  __m256i clamped, mask;
+
+  mask = _mm256_cmpgt_epi16(x, max);
+  clamped = _mm256_andnot_si256(mask, x);
+  mask = _mm256_and_si256(mask, max);
+  clamped = _mm256_or_si256(mask, clamped);
+  mask = _mm256_cmpgt_epi16(clamped, zero);
+  clamped = _mm256_and_si256(clamped, mask);
+
+  return clamped;
+}
+
+static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
+                               int fliplr, int flipud, int shift, int bd) {
+  __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
+  const __m256i zero = _mm256_setzero_si256();
+  int i = 0;
+  (void)fliplr;
+  (void)flipud;
+
+  round_shift_32x32(in, shift);
+
+  while (i < 128) {
+    u0 = _mm256_loadu_si256((const __m256i *)output);
+    u1 = _mm256_loadu_si256((const __m256i *)(output +
16)); + + x0 = _mm256_unpacklo_epi16(u0, zero); + x1 = _mm256_unpackhi_epi16(u0, zero); + x2 = _mm256_unpacklo_epi16(u1, zero); + x3 = _mm256_unpackhi_epi16(u1, zero); + + v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20); + v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31); + v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20); + v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31); + + v0 = _mm256_add_epi32(v0, x0); + v1 = _mm256_add_epi32(v1, x1); + v2 = _mm256_add_epi32(v2, x2); + v3 = _mm256_add_epi32(v3, x3); + + v0 = _mm256_packus_epi32(v0, v1); + v2 = _mm256_packus_epi32(v2, v3); + + v0 = highbd_clamp_epi32(v0, bd); + v2 = highbd_clamp_epi32(v2, bd); + + _mm256_storeu_si256((__m256i *)output, v0); + _mm256_storeu_si256((__m256i *)(output + 16), v2); + output += stride; + i += 4; + } +} + +static INLINE __m256i half_btf_avx2(__m256i w0, __m256i n0, __m256i w1, + __m256i n1, __m256i rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(w0, n0); + y = _mm256_mullo_epi32(w1, n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static void idct32_avx2(__m256i *in, __m256i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = 
_mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i bf1[32], bf0[32]; + int col; + + for (col = 0; col < 4; ++col) { + // stage 0 + // stage 1 + bf1[0] = in[0 * 4 + col]; + bf1[1] = in[16 * 4 + col]; + bf1[2] = in[8 * 4 + col]; + bf1[3] = in[24 * 4 + col]; + bf1[4] = in[4 * 4 + col]; + bf1[5] = in[20 * 4 + col]; + bf1[6] = in[12 * 4 + col]; + bf1[7] = in[28 * 4 + col]; + bf1[8] = in[2 * 4 + col]; + bf1[9] = in[18 * 4 + col]; + bf1[10] = in[10 * 4 + col]; + bf1[11] = in[26 * 4 + col]; + bf1[12] = in[6 * 4 + col]; + bf1[13] = in[22 * 4 + col]; + bf1[14] = in[14 * 4 + col]; + bf1[15] = in[30 * 4 + col]; + bf1[16] = in[1 * 4 + col]; + bf1[17] = in[17 * 4 + col]; + bf1[18] = in[9 * 4 + col]; + bf1[19] = in[25 * 4 + col]; + bf1[20] = in[5 * 4 + col]; + bf1[21] = in[21 * 4 + col]; + bf1[22] = in[13 * 4 + col]; + bf1[23] = in[29 * 4 + col]; + bf1[24] = in[3 * 4 + col]; + bf1[25] = in[19 * 4 + col]; + bf1[26] = in[11 * 4 + col]; + bf1[27] = in[27 * 4 + col]; + bf1[28] = in[7 * 4 + col]; + bf1[29] = in[23 * 4 + col]; + bf1[30] = in[15 * 4 + col]; + bf1[31] = in[31 * 4 + col]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = half_btf_avx2(cospi62, bf1[16], cospim2, bf1[31], rounding, bit); + bf0[17] = half_btf_avx2(cospi30, bf1[17], cospim34, bf1[30], rounding, bit); + bf0[18] = half_btf_avx2(cospi46, bf1[18], cospim18, bf1[29], rounding, bit); + bf0[19] = half_btf_avx2(cospi14, bf1[19], cospim50, bf1[28], rounding, bit); + bf0[20] = half_btf_avx2(cospi54, bf1[20], cospim10, bf1[27], rounding, bit); + bf0[21] = half_btf_avx2(cospi22, bf1[21], cospim42, bf1[26], rounding, bit); + bf0[22] = half_btf_avx2(cospi38, bf1[22], cospim26, bf1[25], rounding, bit); + bf0[23] = half_btf_avx2(cospi6, bf1[23], cospim58, bf1[24], rounding, bit); + bf0[24] = half_btf_avx2(cospi58, bf1[23], cospi6, bf1[24], rounding, bit); + bf0[25] = half_btf_avx2(cospi26, bf1[22], cospi38, bf1[25], rounding, bit); + bf0[26] = half_btf_avx2(cospi42, bf1[21], cospi22, bf1[26], rounding, bit); + bf0[27] = half_btf_avx2(cospi10, bf1[20], cospi54, bf1[27], rounding, bit); + bf0[28] = half_btf_avx2(cospi50, bf1[19], cospi14, bf1[28], rounding, bit); + bf0[29] = half_btf_avx2(cospi18, bf1[18], cospi46, bf1[29], rounding, bit); + bf0[30] = half_btf_avx2(cospi34, bf1[17], cospi30, bf1[30], rounding, bit); + bf0[31] = half_btf_avx2(cospi2, bf1[16], cospi62, bf1[31], rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf_avx2(cospi60, bf0[8], cospim4, bf0[15], rounding, bit); + bf1[9] = half_btf_avx2(cospi28, bf0[9], cospim36, bf0[14], rounding, bit); + 
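// half_btf_avx2() above is the rounded half-butterfly used by every stage
+    // of this transform; per 32-bit lane it computes the scalar expression
+    //   result = (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit
+    // with all intermediate arithmetic kept in 32 bits.
+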
bf1[10] = half_btf_avx2(cospi44, bf0[10], cospim20, bf0[13], rounding, bit); + bf1[11] = half_btf_avx2(cospi12, bf0[11], cospim52, bf0[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi52, bf0[11], cospi12, bf0[12], rounding, bit); + bf1[13] = half_btf_avx2(cospi20, bf0[10], cospi44, bf0[13], rounding, bit); + bf1[14] = half_btf_avx2(cospi36, bf0[9], cospi28, bf0[14], rounding, bit); + bf1[15] = half_btf_avx2(cospi4, bf0[8], cospi60, bf0[15], rounding, bit); + bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]); + bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]); + bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]); + bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]); + bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]); + bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]); + bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]); + bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]); + bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]); + bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]); + bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]); + bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]); + bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]); + bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]); + bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]); + bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = half_btf_avx2(cospi56, bf1[4], cospim8, bf1[7], rounding, bit); + bf0[5] = half_btf_avx2(cospi24, bf1[5], cospim40, bf1[6], rounding, bit); + bf0[6] = half_btf_avx2(cospi40, bf1[5], cospi24, bf1[6], rounding, bit); + bf0[7] = half_btf_avx2(cospi8, bf1[4], cospi56, bf1[7], rounding, bit); + bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]); + bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]); + bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]); + bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]); + bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]); + bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]); + bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]); + bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]); + bf0[16] = bf1[16]; + bf0[17] = half_btf_avx2(cospim8, bf1[17], cospi56, bf1[30], rounding, bit); + bf0[18] = half_btf_avx2(cospim56, bf1[18], cospim8, bf1[29], rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = half_btf_avx2(cospim40, bf1[21], cospi24, bf1[26], rounding, bit); + bf0[22] = + half_btf_avx2(cospim24, bf1[22], cospim40, bf1[25], rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = half_btf_avx2(cospim40, bf1[22], cospi24, bf1[25], rounding, bit); + bf0[26] = half_btf_avx2(cospi24, bf1[21], cospi40, bf1[26], rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = half_btf_avx2(cospim8, bf1[18], cospi56, bf1[29], rounding, bit); + bf0[30] = half_btf_avx2(cospi56, bf1[17], cospi8, bf1[30], rounding, bit); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = half_btf_avx2(cospi32, bf0[0], cospi32, bf0[1], rounding, bit); + bf1[1] = half_btf_avx2(cospi32, bf0[0], cospim32, bf0[1], rounding, bit); + bf1[2] = half_btf_avx2(cospi48, bf0[2], cospim16, bf0[3], rounding, bit); + bf1[3] = half_btf_avx2(cospi16, bf0[2], cospi48, bf0[3], rounding, bit); + bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]); + bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]); + bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]); + bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]); + bf1[8] = bf0[8]; + bf1[9] = half_btf_avx2(cospim16, bf0[9], cospi48, bf0[14], rounding, bit); + bf1[10] = + half_btf_avx2(cospim48, bf0[10], cospim16, bf0[13], rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf_avx2(cospim16, 
bf0[10], cospi48, bf0[13], rounding, bit); + bf1[14] = half_btf_avx2(cospi48, bf0[9], cospi16, bf0[14], rounding, bit); + bf1[15] = bf0[15]; + bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]); + bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]); + bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]); + bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]); + bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]); + bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]); + bf1[22] = _mm256_add_epi32(bf0[21], bf0[22]); + bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]); + bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]); + bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]); + bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]); + bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]); + bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]); + bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]); + bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]); + bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]); + + // stage 6 + bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]); + bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]); + bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]); + bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]); + bf0[4] = bf1[4]; + bf0[5] = half_btf_avx2(cospim32, bf1[5], cospi32, bf1[6], rounding, bit); + bf0[6] = half_btf_avx2(cospi32, bf1[5], cospi32, bf1[6], rounding, bit); + bf0[7] = bf1[7]; + bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]); + bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]); + bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]); + bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]); + bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]); + bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]); + bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]); + bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = half_btf_avx2(cospim16, bf1[18], cospi48, bf1[29], rounding, bit); + bf0[19] = half_btf_avx2(cospim16, bf1[19], cospi48, bf1[28], rounding, bit); + bf0[20] = + half_btf_avx2(cospim48, bf1[20], cospim16, bf1[27], rounding, bit); + bf0[21] = + half_btf_avx2(cospim48, bf1[21], cospim16, bf1[26], rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = half_btf_avx2(cospim16, bf1[21], cospi48, bf1[26], rounding, bit); + bf0[27] = half_btf_avx2(cospim16, bf1[20], cospi48, bf1[27], rounding, bit); + bf0[28] = half_btf_avx2(cospi48, bf1[19], cospi16, bf1[28], rounding, bit); + bf0[29] = half_btf_avx2(cospi48, bf1[18], cospi16, bf1[29], rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]); + bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]); + bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]); + bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]); + bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]); + bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]); + bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]); + bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf_avx2(cospim32, bf0[10], cospi32, bf0[13], rounding, bit); + bf1[11] = half_btf_avx2(cospim32, bf0[11], cospi32, bf0[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi32, bf0[11], cospi32, bf0[12], rounding, bit); + bf1[13] = half_btf_avx2(cospi32, bf0[10], cospi32, bf0[13], rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]); + bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]); + bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]); + bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]); + bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]); + bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]); + bf1[22] = 
_mm256_sub_epi32(bf0[17], bf0[22]); + bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]); + bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]); + bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]); + bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]); + bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]); + bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]); + bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]); + bf1[30] = _mm256_add_epi32(bf0[25], bf0[30]); + bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]); + + // stage 8 + bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]); + bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]); + bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]); + bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]); + bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]); + bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]); + bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]); + bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]); + bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]); + bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]); + bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]); + bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]); + bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]); + bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]); + bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]); + bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = half_btf_avx2(cospim32, bf1[20], cospi32, bf1[27], rounding, bit); + bf0[21] = half_btf_avx2(cospim32, bf1[21], cospi32, bf1[26], rounding, bit); + bf0[22] = half_btf_avx2(cospim32, bf1[22], cospi32, bf1[25], rounding, bit); + bf0[23] = half_btf_avx2(cospim32, bf1[23], cospi32, bf1[24], rounding, bit); + bf0[24] = half_btf_avx2(cospi32, bf1[23], cospi32, bf1[24], rounding, bit); + bf0[25] = half_btf_avx2(cospi32, bf1[22], cospi32, bf1[25], rounding, bit); + bf0[26] = half_btf_avx2(cospi32, bf1[21], cospi32, bf1[26], rounding, bit); + bf0[27] = half_btf_avx2(cospi32, bf1[20], cospi32, bf1[27], rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]); + out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]); + out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]); + out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]); + out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]); + out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]); + out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]); + out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]); + out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]); + out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]); + out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]); + out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]); + out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]); + out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]); + out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]); + out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]); + out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]); + out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]); + out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]); + out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]); + out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]); + out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]); + out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]); + out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]); + out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]); + out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]); + out[26 * 4 + col] = 
_mm256_sub_epi32(bf0[5], bf0[26]);
+    out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]);
+    out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]);
+    out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]);
+    out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]);
+    out[31 * 4 + col] = _mm256_sub_epi32(bf0[0], bf0[31]);
+  }
+}
+
+void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
+                                   int stride, int tx_type, int bd) {
+  __m256i in[128], out[128];
+  const TXFM_2D_CFG *cfg = NULL;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &inv_txfm_2d_cfg_dct_dct_32;
+      load_buffer_32x32(coeff, in);
+      transpose_32x32(in, out);
+      idct32_avx2(out, in, cfg->cos_bit_row[2]);
+      round_shift_32x32(in, -cfg->shift[0]);
+      transpose_32x32(in, out);
+      idct32_avx2(out, in, cfg->cos_bit_col[2]);
+      write_buffer_32x32(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    default: assert(0);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 000000000..24b2760b9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,1398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3, x, y;
+
+  v0 = _mm_unpacklo_epi32(in[0], in[1]);
+  v1 = _mm_unpackhi_epi32(in[0], in[1]);
+  v2 = _mm_unpacklo_epi32(in[2], in[3]);
+  v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  u0 = _mm_unpacklo_epi64(v0, v2);
+  u1 = _mm_unpackhi_epi64(v0, v2);
+  u2 = _mm_unpacklo_epi64(v1, v3);
+  u3 = _mm_unpackhi_epi64(v1, v3);
+
+  x = _mm_mullo_epi32(u0, cospi32);
+  y = _mm_mullo_epi32(u2, cospi32);
+  v0 = _mm_add_epi32(x, y);
+  v0 = _mm_add_epi32(v0, rnding);
+  v0 = _mm_srai_epi32(v0, bit);
+
+  v1 = _mm_sub_epi32(x, y);
+  v1 = _mm_add_epi32(v1, rnding);
+  v1 = _mm_srai_epi32(v1, bit);
+
+  x = _mm_mullo_epi32(u1, cospi48);
+  y = _mm_mullo_epi32(u3, cospim16);
+  v2 = _mm_add_epi32(x, y);
+  v2 = _mm_add_epi32(v2, rnding);
+  v2 = _mm_srai_epi32(v2, bit);
+
+  x = _mm_mullo_epi32(u1, cospi16);
+  y = _mm_mullo_epi32(u3, cospi48);
+  v3 = _mm_add_epi32(x, y);
+  v3 = _mm_add_epi32(v3, rnding);
+  v3 = _mm_srai_epi32(v3, bit);
+
+  in[0] =
_mm_add_epi32(v0, v3); + in[1] = _mm_add_epi32(v1, v2); + in[2] = _mm_sub_epi32(v1, v2); + in[3] = _mm_sub_epi32(v0, v3); +} + +static void iadst4x4_sse4_1(__m128i *in, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3, x, y; + + v0 = _mm_unpacklo_epi32(in[0], in[1]); + v1 = _mm_unpackhi_epi32(in[0], in[1]); + v2 = _mm_unpacklo_epi32(in[2], in[3]); + v3 = _mm_unpackhi_epi32(in[2], in[3]); + + u0 = _mm_unpacklo_epi64(v0, v2); + u1 = _mm_unpackhi_epi64(v0, v2); + u2 = _mm_unpacklo_epi64(v1, v3); + u3 = _mm_unpackhi_epi64(v1, v3); + + // stage 0 + // stage 1 + u1 = _mm_sub_epi32(zero, u1); + u3 = _mm_sub_epi32(zero, u3); + + // stage 2 + v0 = u0; + v1 = u3; + x = _mm_mullo_epi32(u1, cospi32); + y = _mm_mullo_epi32(u2, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + + // stage 4 + x = _mm_mullo_epi32(u0, cospi8); + y = _mm_mullo_epi32(u1, cospi56); + in[3] = _mm_add_epi32(x, y); + in[3] = _mm_add_epi32(in[3], rnding); + in[3] = _mm_srai_epi32(in[3], bit); + + x = _mm_mullo_epi32(u0, cospi56); + y = _mm_mullo_epi32(u1, cospim8); + in[0] = _mm_add_epi32(x, y); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); + + x = _mm_mullo_epi32(u2, cospi40); + y = _mm_mullo_epi32(u3, cospi24); + in[1] = _mm_add_epi32(x, y); + in[1] = _mm_add_epi32(in[1], rnding); + in[1] = _mm_srai_epi32(in[1], bit); + + x = _mm_mullo_epi32(u2, cospi24); + y = _mm_mullo_epi32(u3, cospim40); + in[2] = _mm_add_epi32(x, y); + in[2] = _mm_add_epi32(in[2], rnding); + in[2] = _mm_srai_epi32(in[2], bit); +} + +static INLINE void round_shift_4x4(__m128i *in, int shift) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); +} + +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + round_shift_4x4(in, shift); + + v0 = 
_mm_loadl_epi64((__m128i const *)(output + 0 * stride)); + v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); + v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); + v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); + + v0 = _mm_unpacklo_epi16(v0, zero); + v1 = _mm_unpacklo_epi16(v1, zero); + v2 = _mm_unpacklo_epi16(v2, zero); + v3 = _mm_unpacklo_epi16(v3, zero); + + if (fliplr) { + in[0] = _mm_shuffle_epi32(in[0], 0x1B); + in[1] = _mm_shuffle_epi32(in[1], 0x1B); + in[2] = _mm_shuffle_epi32(in[2], 0x1B); + in[3] = _mm_shuffle_epi32(in[3], 0x1B); + } + + if (flipud) { + u0 = _mm_add_epi32(in[3], v0); + u1 = _mm_add_epi32(in[2], v1); + u2 = _mm_add_epi32(in[1], v2); + u3 = _mm_add_epi32(in[0], v3); + } else { + u0 = _mm_add_epi32(in[0], v0); + u1 = _mm_add_epi32(in[1], v1); + u2 = _mm_add_epi32(in[2], v2); + u3 = _mm_add_epi32(in[3], v3); + } + + v0 = _mm_packus_epi32(u0, u1); + v2 = _mm_packus_epi32(u2, u3); + + u0 = highbd_clamp_epi16(v0, bd); + u2 = highbd_clamp_epi16(v2, bd); + + v0 = _mm_unpacklo_epi64(u0, u0); + v1 = _mm_unpackhi_epi64(u0, u0); + v2 = _mm_unpacklo_epi64(u2, u2); + v3 = _mm_unpackhi_epi64(u2, u2); + + _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); + _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); + _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); + _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); +} + +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, + int stride, int tx_type, int bd) { + __m128i in[4]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &inv_txfm_2d_cfg_dct_dct_4; + load_buffer_4x4(coeff, in); + idct4x4_sse4_1(in, cfg->cos_bit_row[2]); + idct4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_DCT: + cfg = &inv_txfm_2d_cfg_adst_dct_4; + load_buffer_4x4(coeff, in); + idct4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case DCT_ADST: + cfg = &inv_txfm_2d_cfg_dct_adst_4; + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + idct4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_ADST: + cfg = &inv_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + cfg = &inv_txfm_2d_cfg_adst_dct_4; + load_buffer_4x4(coeff, in); + idct4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + break; + case DCT_FLIPADST: + cfg = &inv_txfm_2d_cfg_dct_adst_4; + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + idct4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd); + break; + case FLIPADST_FLIPADST: + cfg = &inv_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd); + break; + case ADST_FLIPADST: + cfg = &inv_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, 
output, stride, 1, 0, -cfg->shift[1], bd); + break; + case FLIPADST_ADST: + cfg = &inv_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + break; +#endif // CONFIG_EXT_TX + default: assert(0); + } +} + +// 8x8 +static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); + in[4] = _mm_load_si128((const __m128i *)(coeff + 16)); + in[5] = _mm_load_si128((const __m128i *)(coeff + 20)); + in[6] = _mm_load_si128((const __m128i *)(coeff + 24)); + in[7] = _mm_load_si128((const __m128i *)(coeff + 28)); + in[8] = _mm_load_si128((const __m128i *)(coeff + 32)); + in[9] = _mm_load_si128((const __m128i *)(coeff + 36)); + in[10] = _mm_load_si128((const __m128i *)(coeff + 40)); + in[11] = _mm_load_si128((const __m128i *)(coeff + 44)); + in[12] = _mm_load_si128((const __m128i *)(coeff + 48)); + in[13] = _mm_load_si128((const __m128i *)(coeff + 52)); + in[14] = _mm_load_si128((const __m128i *)(coeff + 56)); + in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); +} + +static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
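+  // Element (r, c) of the 8x8 block sits in lane (c & 3) of in[2 * r + col],
+  // with col = c >> 2, so each pass of the loop below transforms one 4-lane
+  // half of all 8 rows. Every mullo/add/srai triple is the rounded rotation
+  //   out = (w0 * a + w1 * b + (1 << (bit - 1))) >> bit
+  // evaluated per 32-bit lane.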
+ for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + // stage 2 + u0 = in[0 * 2 + col]; + u1 = in[4 * 2 + col]; + u2 = in[2 * 2 + col]; + u3 = in[6 * 2 + col]; + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi56); + y = _mm_mullo_epi32(in[7 * 2 + col], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi8); + y = _mm_mullo_epi32(in[7 * 2 + col], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi24); + y = _mm_mullo_epi32(in[3 * 2 + col], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi40); + y = _mm_mullo_epi32(in[3 * 2 + col], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + v4 = _mm_add_epi32(u4, u5); + v5 = _mm_sub_epi32(u4, u5); + v6 = _mm_sub_epi32(u7, u6); + v7 = _mm_add_epi32(u6, u7); + + // stage 4 + u0 = _mm_add_epi32(v0, v3); + u1 = _mm_add_epi32(v1, v2); + u2 = _mm_sub_epi32(v1, v2); + u3 = _mm_sub_epi32(v0, v3); + u4 = v4; + u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + out[0 * 2 + col] = _mm_add_epi32(u0, u7); + out[1 * 2 + col] = _mm_add_epi32(u1, u6); + out[2 * 2 + col] = _mm_add_epi32(u2, u5); + out[3 * 2 + col] = _mm_add_epi32(u3, u4); + out[4 * 2 + col] = _mm_sub_epi32(u3, u4); + out[5 * 2 + col] = _mm_sub_epi32(u2, u5); + out[6 * 2 + col] = _mm_sub_epi32(u1, u6); + out[7 * 2 + col] = _mm_sub_epi32(u0, u7); + } +} + +static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i rnding = _mm_set1_epi32(1 << 
(bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). + for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + u0 = in[2 * 0 + col]; + u1 = _mm_sub_epi32(zero, in[2 * 7 + col]); + u2 = _mm_sub_epi32(zero, in[2 * 3 + col]); + u3 = in[2 * 4 + col]; + u4 = _mm_sub_epi32(zero, in[2 * 1 + col]); + u5 = in[2 * 6 + col]; + u6 = in[2 * 2 + col]; + u7 = _mm_sub_epi32(zero, in[2 * 5 + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 = _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, v7); + + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = 
_mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 7 + out[2 * 0 + col] = v1; + out[2 * 1 + col] = v6; + out[2 * 2 + col] = v3; + out[2 * 3 + col] = v4; + out[2 * 4 + col] = v5; + out[2 * 5 + col] = v2; + out[2 * 6 + col] = v7; + out[2 * 7 + col] = v0; + } +} + +static void round_shift_8x8(__m128i *in, int shift) { + round_shift_4x4(&in[0], shift); + round_shift_4x4(&in[4], shift); + round_shift_4x4(&in[8], shift); + round_shift_4x4(&in[12], shift); +} + +static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, + int fliplr, int bd) { + __m128i x0, x1; + const __m128i zero = _mm_setzero_si128(); + + x0 = _mm_unpacklo_epi16(pred, zero); + x1 = _mm_unpackhi_epi16(pred, zero); + + if (fliplr) { + res_lo = _mm_shuffle_epi32(res_lo, 0x1B); + res_hi = _mm_shuffle_epi32(res_hi, 0x1B); + x0 = _mm_add_epi32(res_hi, x0); + x1 = _mm_add_epi32(res_lo, x1); + + } else { + x0 = _mm_add_epi32(res_lo, x0); + x1 = _mm_add_epi32(res_hi, x1); + } + + x0 = _mm_packus_epi32(x0, x1); + return highbd_clamp_epi16(x0, bd); +} + +static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + round_shift_8x8(in, shift); + + v0 = _mm_load_si128((__m128i const *)(output + 0 * stride)); + v1 = _mm_load_si128((__m128i const *)(output + 1 * stride)); + v2 = _mm_load_si128((__m128i const *)(output + 2 * stride)); + v3 = _mm_load_si128((__m128i const *)(output + 3 * stride)); + v4 = _mm_load_si128((__m128i const *)(output + 4 * stride)); + v5 = _mm_load_si128((__m128i const *)(output + 5 * stride)); + v6 = _mm_load_si128((__m128i const *)(output + 6 * stride)); + v7 = _mm_load_si128((__m128i const *)(output + 7 * stride)); + + if (flipud) { + u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); + u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); + u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); + u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); + u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); + u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); + u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); + u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); + } else { + u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); + u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); + u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); + u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); + u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); + u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); + u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); + u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); + } + + _mm_store_si128((__m128i *)(output + 0 * stride), u0); + _mm_store_si128((__m128i *)(output + 1 * stride), u1); + _mm_store_si128((__m128i *)(output + 2 * stride), u2); + _mm_store_si128((__m128i *)(output + 3 * stride), u3); + _mm_store_si128((__m128i *)(output + 4 * stride), u4); + _mm_store_si128((__m128i *)(output + 5 * stride), u5); + _mm_store_si128((__m128i *)(output + 6 * stride), u6); + _mm_store_si128((__m128i *)(output + 7 * stride), u7); +} + +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, + int stride, int tx_type, int bd) { + __m128i in[16], 
out[16]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &inv_txfm_2d_cfg_dct_dct_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case DCT_ADST: + cfg = &inv_txfm_2d_cfg_dct_adst_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_DCT: + cfg = &inv_txfm_2d_cfg_adst_dct_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_ADST: + cfg = &inv_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + cfg = &inv_txfm_2d_cfg_adst_dct_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + break; + case DCT_FLIPADST: + cfg = &inv_txfm_2d_cfg_dct_adst_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + break; + case ADST_FLIPADST: + cfg = &inv_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + break; + case FLIPADST_FLIPADST: + cfg = &inv_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd); + break; + case FLIPADST_ADST: + cfg = &inv_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(coeff, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + break; +#endif // CONFIG_EXT_TX + default: assert(0); + } +} + +// 16x16 +static void load_buffer_16x16(const int32_t *coeff, __m128i *in) { + int i; + for (i = 0; i < 64; ++i) { + in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2))); + } +} + +static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8, + int col) { + int i; + for (i = 0; i < 16; i += 2) { + in8x8[i] = in[col]; + in8x8[i + 1] = in[col + 1]; + col += 4; + } +} + +static void swap_addr(uint16_t **output1, uint16_t **output2) { + uint16_t *tmp; + tmp = *output1; + *output1 = *output2; + *output2 = tmp; +} + +static void write_buffer_16x16(__m128i *in, uint16_t *output, int 
stride, + int fliplr, int flipud, int shift, int bd) { + __m128i in8x8[16]; + uint16_t *leftUp = &output[0]; + uint16_t *rightUp = &output[8]; + uint16_t *leftDown = &output[8 * stride]; + uint16_t *rightDown = &output[8 * stride + 8]; + + if (fliplr) { + swap_addr(&leftUp, &rightUp); + swap_addr(&leftDown, &rightDown); + } + + if (flipud) { + swap_addr(&leftUp, &leftDown); + swap_addr(&rightUp, &rightDown); + } + + // Left-up quarter + assign_8x8_input_from_16x16(in, in8x8, 0); + write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd); + + // Right-up quarter + assign_8x8_input_from_16x16(in, in8x8, 2); + write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd); + + // Left-down quarter + assign_8x8_input_from_16x16(in, in8x8, 32); + write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd); + + // Right-down quarter + assign_8x8_input_from_16x16(in, in8x8, 34); + write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd); +} + +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[16], v[16], x, y; + int col; + + for (col = 0; col < 4; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * 4 + col]; + u[1] = in[8 * 4 + col]; + u[2] = in[4 * 4 + col]; + u[3] = in[12 * 4 + col]; + u[4] = in[2 * 4 + col]; + u[5] = in[10 * 4 + col]; + u[6] = in[6 * 4 + col]; + u[7] = in[14 * 4 + col]; + u[8] = in[1 * 4 + col]; + u[9] = in[9 * 4 + col]; + u[10] = in[5 * 4 + col]; + u[11] = in[13 * 4 + col]; + u[12] = in[3 * 4 + col]; + u[13] = in[11 * 4 + col]; + u[14] = in[7 * 4 + col]; + u[15] = in[15 * 4 + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit); + v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit); + v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit); + v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit); + v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit); + v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit); + v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], 
rnding, bit); + v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit); + u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit); + u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit); + u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit); + u[8] = _mm_add_epi32(v[8], v[9]); + u[9] = _mm_sub_epi32(v[8], v[9]); + u[10] = _mm_sub_epi32(v[11], v[10]); + u[11] = _mm_add_epi32(v[10], v[11]); + u[12] = _mm_add_epi32(v[12], v[13]); + u[13] = _mm_sub_epi32(v[12], v[13]); + u[14] = _mm_sub_epi32(v[15], v[14]); + u[15] = _mm_add_epi32(v[14], v[15]); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + y = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(x, y); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(x, y); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit); + v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit); + v[4] = _mm_add_epi32(u[4], u[5]); + v[5] = _mm_sub_epi32(u[4], u[5]); + v[6] = _mm_sub_epi32(u[7], u[6]); + v[7] = _mm_add_epi32(u[6], u[7]); + v[8] = u[8]; + v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit); + v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit); + v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit); + v[15] = u[15]; + + // stage 5 + u[0] = _mm_add_epi32(v[0], v[3]); + u[1] = _mm_add_epi32(v[1], v[2]); + u[2] = _mm_sub_epi32(v[1], v[2]); + u[3] = _mm_sub_epi32(v[0], v[3]); + u[4] = v[4]; + + x = _mm_mullo_epi32(v[5], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm_add_epi32(v[8], v[11]); + u[9] = _mm_add_epi32(v[9], v[10]); + u[10] = _mm_sub_epi32(v[9], v[10]); + u[11] = _mm_sub_epi32(v[8], v[11]); + u[12] = _mm_sub_epi32(v[15], v[12]); + u[13] = _mm_sub_epi32(v[14], v[13]); + u[14] = _mm_add_epi32(v[13], v[14]); + u[15] = _mm_add_epi32(v[12], v[15]); + + // stage 6 + v[0] = _mm_add_epi32(u[0], u[7]); + v[1] = _mm_add_epi32(u[1], u[6]); + v[2] = _mm_add_epi32(u[2], u[5]); + v[3] = _mm_add_epi32(u[3], u[4]); + v[4] = _mm_sub_epi32(u[3], u[4]); + v[5] = _mm_sub_epi32(u[2], u[5]); + v[6] = _mm_sub_epi32(u[1], u[6]); + v[7] = _mm_sub_epi32(u[0], u[7]); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_sub_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_add_epi32(x, y); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_add_epi32(x, y); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]); + out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]); + 
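// The remaining outputs follow the same mirrored butterfly: for k = 0..7,
+    // out[k] = v[k] + v[15 - k] and out[15 - k] = v[k] - v[15 - k], with
+    // indices scaled by 4 because the 16 rows are striped across four
+    // 4-lane column groups.
+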
out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]); + out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]); + out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]); + out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]); + out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]); + out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]); + out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]); + out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]); + out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]); + out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]); + out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]); + out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]); + out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]); + out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]); + } +} + +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + + __m128i u[16], v[16], x, y; + int col; + + for (col = 0; col < 4; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * 4 + col]; + u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]); + u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]); + u[3] = in[8 * 4 + col]; + u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]); + u[5] = in[12 * 4 + col]; + u[6] = in[4 * 4 + col]; + u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]); + u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]); + u[9] = in[14 * 4 + col]; + u[10] = in[6 * 4 + col]; + u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]); + u[12] = in[2 * 4 + col]; + u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]); + u[14] = _mm_sub_epi32(zero, 
in[5 * 4 + col]); + u[15] = in[10 * 4 + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(x, y); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(x, y); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(x, y); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(x, y); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit); + v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit); + v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit); + v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit); + v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit); + v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit); + v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit); + + // stage 5 + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit); + v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], 
rnding, bit); + v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit); + v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit); + v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit); + v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit); + v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit); + v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit); + + // stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit); + v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit); + v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit); + v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit); + v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit); + v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit); + v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit); + v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit); + v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit); + v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit); + v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit); + v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit); + v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit); + v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit); + v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit); + v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit); + + // stage 9 + out[0 * 4 + col] = v[1]; + out[1 * 4 + col] = v[14]; + out[2 * 4 + col] = v[3]; + out[3 * 4 + col] = v[12]; + out[4 * 4 + col] = v[5]; + out[5 * 4 + col] = v[10]; + out[6 * 4 + col] = v[7]; + out[7 * 4 + col] = v[8]; + out[8 * 4 + col] = v[9]; + out[9 * 4 + col] = v[6]; + out[10 * 4 + col] = v[11]; + out[11 * 4 + col] = v[4]; + out[12 * 4 + col] = v[13]; + out[13 * 4 + col] = v[2]; + out[14 * 4 + col] = v[15]; + out[15 * 4 + col] = v[0]; + } +} + +static void round_shift_16x16(__m128i *in, int shift) { + round_shift_8x8(&in[0], shift); + round_shift_8x8(&in[16], shift); + round_shift_8x8(&in[32], shift); + round_shift_8x8(&in[48], shift); +} + +void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, + int stride, int tx_type, int bd) { + __m128i in[64], out[64]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &inv_txfm_2d_cfg_dct_dct_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case DCT_ADST: + cfg = &inv_txfm_2d_cfg_dct_adst_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + 
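+      // Every case follows the same 2-D pipeline as DCT_DCT above: row
+      // transform at cos_bit_row precision, round-shift, transpose back,
+      // column transform at cos_bit_col precision, then a final round-shift
+      // inside write_buffer_16x16. Only the 1-D kernels (idct16x16_sse4_1
+      // vs. iadst16x16_sse4_1) and the flip flags differ per tx_type.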
iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_DCT: + cfg = &inv_txfm_2d_cfg_adst_dct_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_ADST: + cfg = &inv_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + cfg = &inv_txfm_2d_cfg_adst_dct_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + break; + case DCT_FLIPADST: + cfg = &inv_txfm_2d_cfg_dct_adst_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd); + break; + case ADST_FLIPADST: + cfg = &inv_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd); + break; + case FLIPADST_FLIPADST: + cfg = &inv_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd); + break; + case FLIPADST_ADST: + cfg = &inv_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(coeff, in); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); + round_shift_16x16(in, -cfg->shift[0]); + transpose_16x16(in, out); + iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); + write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + break; +#endif + default: assert(0); + } +} diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h new file mode 100644 index 000000000..bc96defe3 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
+#define _HIGHBD_TXFM_UTILITY_SSE4_H
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+  do {                                                \
+    __m128i u0, u1, u2, u3;                           \
+    u0 = _mm_unpacklo_epi32(x0, x1);                  \
+    u1 = _mm_unpackhi_epi32(x0, x1);                  \
+    u2 = _mm_unpacklo_epi32(x2, x3);                  \
+    u3 = _mm_unpackhi_epi32(x2, x3);                  \
+    y0 = _mm_unpacklo_epi64(u0, u2);                  \
+    y1 = _mm_unpackhi_epi64(u0, u2);                  \
+    y2 = _mm_unpacklo_epi64(u1, u3);                  \
+    y3 = _mm_unpackhi_epi64(u1, u3);                  \
+  } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+                out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+  // Upper left 8x8
+  TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+  TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+                out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+                out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+                out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+                out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+                out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+                out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+                out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+                out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+                out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+                out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+                out[31]);
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+                out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+                out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+                out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+                out[63]);
+}
+
+// Note:
+//  rounding = 1 << (bit - 1)
+static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0, __m128i w1,
+                                      __m128i n1, __m128i rounding, int bit) {
+  __m128i x, y;
+
+  x = _mm_mullo_epi32(w0, n0);
+  y = _mm_mullo_epi32(w1, n1);
+  x = _mm_add_epi32(x, y);
+  x = _mm_add_epi32(x, rounding);
+  x = _mm_srai_epi32(x, bit);
+  return x;
+}
+
+#endif  // _HIGHBD_TXFM_UTILITY_SSE4_H
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
new file mode 100644
index 000000000..c25db88b7
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+static const __m128i *const filter = (const __m128i *const)warped_filter;
+
+/* SSSE3 version of the rotzoom/affine warp filter */
+void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
+                                  int height, int stride, uint16_t *pred,
+                                  int p_col, int p_row, int p_width,
+                                  int p_height, int p_stride, int subsampling_x,
+                                  int subsampling_y, int bd, int ref_frm,
+                                  int16_t alpha, int16_t beta, int16_t gamma,
+                                  int16_t delta) {
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+  __m128i tmp[15];
+#else
+#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
+#endif
+  int i, j, k;
+
+  /* Note: For this code to work, the left/right frame borders need to be
+     extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but the check below is kept
+     (commented out) for debugging purposes.
+  */
+  /*for (i = 0; i < height; ++i) {
+    for (j = 0; j < 13; ++j) {
+      assert(ref[i * stride - 13 + j] == ref[i * stride]);
+      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+    }
+  }*/
+
+  for (i = 0; i < p_height; i += 8) {
+    for (j = 0; j < p_width; j += 8) {
+      // (x, y) coordinates of the center of this block in the destination
+      // image
+      int32_t dst_x = p_col + j + 4;
+      int32_t dst_y = p_row + i + 4;
+
+      int32_t x4, y4, ix4, sx4, iy4, sy4;
+      if (subsampling_x)
+        x4 = ROUND_POWER_OF_TWO_SIGNED(
+            mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
+                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+            1);
+      else
+        x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
+
+      if (subsampling_y)
+        y4 = ROUND_POWER_OF_TWO_SIGNED(
+            mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
+                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+            1);
+      else
+        y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
+
+      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      // Horizontal filter
+      for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+        int iy = iy4 + k;
+        if (iy < 0)
+          iy = 0;
+        else if (iy > height - 1)
+          iy = height - 1;
+
+        // If the block is aligned such that, after clamping, every sample
+        // would be taken from the leftmost/rightmost column, then we can
+        // skip the expensive horizontal filter.
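+        // The 8-tap window for output pixel n covers source columns
+        // ix4 + n - 7 .. ix4 + n, so ix4 <= -7 pins every tap to column 0
+        // and ix4 >= width + 6 pins every tap to column width - 1. The
+        // filtered value then reduces to the clamped sample times the
+        // filter's DC gain (1 << WARPEDPIXEL_FILTER_BITS), pre-shifted by
+        // HORSHEAR_REDUCE_PREC_BITS below.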
+ if (ix4 <= -7) { + tmp[k + 7] = _mm_set1_epi16( + ref[iy * stride] * + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } else if (ix4 >= width + 6) { + tmp[k + 7] = _mm_set1_epi16( + ref[iy * stride + (width - 1)] * + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } else { + int sx = sx4 + alpha * (-4) + beta * k + + // Include rounding and offset here + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + // Load source pixels + __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + // Filter even-index pixels + __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]; + + // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 + __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 + __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 + __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 + __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 + __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 + __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 + __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 + __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + __m128i round_const = + _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1); + + // Calculate filtered results + __m128i res_0 = _mm_madd_epi16(src, coeff_0); + __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2); + __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4); + __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + HORSHEAR_REDUCE_PREC_BITS); + + // Filter odd-index pixels + __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]; + + __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1); + __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3); + __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5); + __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = 
_mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + HORSHEAR_REDUCE_PREC_BITS); + + // Combine results into one register. + // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 + // as this order helps with the vertical filter. + tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); + } + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + gamma * (-4) + delta * k + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + __m128i *src = tmp + (k + 4); + __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS]; + + __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS]; + __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS]; + + __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + // Round and pack into 8 bits + __m128i round_const = + _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1); + + __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); + __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + // Clamp res_16bit to the range [0, 2^bd - 1] + __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + __m128i zero = _mm_setzero_si128(); + res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); + + // Store, blending with 'pred' if needed + __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); + _mm_storel_epi64(p, res_16bit); + } else { + if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); + _mm_storeu_si128(p, res_16bit); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c new file mode 100644 index 000000000..efc8d1e24 --- /dev/null +++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c @@ -0,0 +1,507 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>  // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_HIGHBITDEPTH
+  *in = _mm256_setr_epi16(
+      (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+      (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+      (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+      (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+      (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+      (int16_t)coeff[15]);
+#else
+  *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+  int i = 0;
+  while (i < 16) {
+    load_coeff(coeff + (i << 4), &in[i]);
+    i += 1;
+  }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x = _mm_loadu_si128((__m128i const *)output);
+  __m128i p0 = _mm_unpacklo_epi8(x, zero);
+  __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+  x = _mm_packus_epi16(p0, p1);
+  _mm_storeu_si128((__m128i *)output, x);
+}
+
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride,
+                               uint8_t *output) {
+  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+  int i = 0;
+
+  while (i < 16) {
+    in[i] = _mm256_add_epi16(in[i], rounding);
+    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+    recon_and_store(&in[i], output + i * stride);
+    i += 1;
+  }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+                                     const __m256i *c0, const __m256i *c1,
+                                     __m256i *b0, __m256i *b1) {
+  __m256i x0, x1;
+  x0 = _mm256_unpacklo_epi16(*a0, *a1);
+  x1 = _mm256_unpackhi_epi16(*a0, *a1);
+  *b0 = butter_fly(x0, x1, *c0);
+  *b1 = butter_fly(x0, x1, *c1);
+}
+
+static void idct16_avx2(__m256i *in) {
+  const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
+  const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
+  const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
+  const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
+  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
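+  // Each __m256i spans all 16 columns as 16-bit lanes, so a single pass
+  // here is a full 16-point IDCT per column. u*/t* carry the half fed by
+  // the even-index inputs (0, 2, ..., 14), v* the half fed by the
+  // odd-index inputs; stage 7 merges the two halves.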
__m256i t0, t1, t2, t3, t4, t5, t6, t7; + + // stage 1, (0-7) + u0 = in[0]; + u1 = in[8]; + u2 = in[4]; + u3 = in[12]; + u4 = in[2]; + u5 = in[10]; + u6 = in[6]; + u7 = in[14]; + + // stage 2, (0-7) + // stage 3, (0-7) + t0 = u0; + t1 = u1; + t2 = u2; + t3 = u3; + unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7); + unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6); + + // stage 4, (0-7) + unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1); + unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3); + u4 = _mm256_add_epi16(t4, t5); + u5 = _mm256_sub_epi16(t4, t5); + u6 = _mm256_sub_epi16(t7, t6); + u7 = _mm256_add_epi16(t7, t6); + + // stage 5, (0-7) + t0 = _mm256_add_epi16(u0, u3); + t1 = _mm256_add_epi16(u1, u2); + t2 = _mm256_sub_epi16(u1, u2); + t3 = _mm256_sub_epi16(u0, u3); + t4 = u4; + t7 = u7; + unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6); + + // stage 6, (0-7) + u0 = _mm256_add_epi16(t0, t7); + u1 = _mm256_add_epi16(t1, t6); + u2 = _mm256_add_epi16(t2, t5); + u3 = _mm256_add_epi16(t3, t4); + u4 = _mm256_sub_epi16(t3, t4); + u5 = _mm256_sub_epi16(t2, t5); + u6 = _mm256_sub_epi16(t1, t6); + u7 = _mm256_sub_epi16(t0, t7); + + // stage 1, (8-15) + v0 = in[1]; + v1 = in[9]; + v2 = in[5]; + v3 = in[13]; + v4 = in[3]; + v5 = in[11]; + v6 = in[7]; + v7 = in[15]; + + // stage 2, (8-15) + unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7); + unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6); + unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5); + unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4); + + // stage 3, (8-15) + v0 = _mm256_add_epi16(t0, t1); + v1 = _mm256_sub_epi16(t0, t1); + v2 = _mm256_sub_epi16(t3, t2); + v3 = _mm256_add_epi16(t2, t3); + v4 = _mm256_add_epi16(t4, t5); + v5 = _mm256_sub_epi16(t4, t5); + v6 = _mm256_sub_epi16(t7, t6); + v7 = _mm256_add_epi16(t6, t7); + + // stage 4, (8-15) + t0 = v0; + t7 = v7; + t3 = v3; + t4 = v4; + unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); + unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); + + // stage 5, (8-15) + v0 = _mm256_add_epi16(t0, t3); + v1 = _mm256_add_epi16(t1, t2); + v2 = _mm256_sub_epi16(t1, t2); + v3 = _mm256_sub_epi16(t0, t3); + v4 = _mm256_sub_epi16(t7, t4); + v5 = _mm256_sub_epi16(t6, t5); + v6 = _mm256_add_epi16(t6, t5); + v7 = _mm256_add_epi16(t7, t4); + + // stage 6, (8-15) + t0 = v0; + t1 = v1; + t6 = v6; + t7 = v7; + unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5); + unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4); + + // stage 7 + in[0] = _mm256_add_epi16(u0, t7); + in[1] = _mm256_add_epi16(u1, t6); + in[2] = _mm256_add_epi16(u2, t5); + in[3] = _mm256_add_epi16(u3, t4); + in[4] = _mm256_add_epi16(u4, t3); + in[5] = _mm256_add_epi16(u5, t2); + in[6] = _mm256_add_epi16(u6, t1); + in[7] = _mm256_add_epi16(u7, t0); + in[8] = _mm256_sub_epi16(u7, t0); + in[9] = _mm256_sub_epi16(u6, t1); + in[10] = _mm256_sub_epi16(u5, t2); + in[11] = _mm256_sub_epi16(u4, t3); + in[12] = _mm256_sub_epi16(u3, t4); + in[13] = _mm256_sub_epi16(u2, t5); + in[14] = _mm256_sub_epi16(u1, t6); + in[15] = _mm256_sub_epi16(u0, t7); +} + +static void idct16(__m256i *in) { + mm256_transpose_16x16(in); + idct16_avx2(in); +} + +static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1, + const __m256i *c0, const __m256i *c1, + __m256i *b) { + __m256i x0, x1; + x0 = 
_mm256_unpacklo_epi16(*a0, *a1); + x1 = _mm256_unpackhi_epi16(*a0, *a1); + b[0] = _mm256_madd_epi16(x0, *c0); + b[1] = _mm256_madd_epi16(x1, *c0); + b[2] = _mm256_madd_epi16(x0, *c1); + b[3] = _mm256_madd_epi16(x1, *c1); +} + +static INLINE void group_rounding(__m256i *a, int num) { + const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + int i; + for (i = 0; i < num; ++i) { + a[i] = _mm256_add_epi32(a[i], dct_rounding); + a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS); + } +} + +static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) { + __m256i x[4]; + x[0] = _mm256_add_epi32(a[0], b[0]); + x[1] = _mm256_add_epi32(a[1], b[1]); + x[2] = _mm256_add_epi32(a[2], b[2]); + x[3] = _mm256_add_epi32(a[3], b[3]); + + group_rounding(x, 4); + + out[0] = _mm256_packs_epi32(x[0], x[1]); + out[1] = _mm256_packs_epi32(x[2], x[3]); +} + +static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) { + __m256i x[4]; + x[0] = _mm256_sub_epi32(a[0], b[0]); + x[1] = _mm256_sub_epi32(a[1], b[1]); + x[2] = _mm256_sub_epi32(a[2], b[2]); + x[3] = _mm256_sub_epi32(a[3], b[3]); + + group_rounding(x, 4); + + out[0] = _mm256_packs_epi32(x[0], x[1]); + out[1] = _mm256_packs_epi32(x[2], x[3]); +} + +static INLINE void butterfly_rnd(__m256i *a, __m256i *out) { + group_rounding(a, 4); + out[0] = _mm256_packs_epi32(a[0], a[1]); + out[1] = _mm256_packs_epi32(a[2], a[3]); +} + +static void iadst16_avx2(__m256i *in) { + const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); + const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); + const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); + const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); + const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); + const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); + const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); + const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64); + const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); + const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); + const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); + const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); + const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); + const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); + const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); + const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); + const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); + const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); + const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); + const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); + const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); + const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); + const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); + const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); + const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); + const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i 
cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + const __m256i zero = _mm256_setzero_si256(); + __m256i x[16], s[16]; + __m256i u[4], v[4]; + + // stage 1 + butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u); + butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v); + add_rnd(u, v, &x[0]); + sub_rnd(u, v, &x[8]); + + butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u); + butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v); + add_rnd(u, v, &x[2]); + sub_rnd(u, v, &x[10]); + + butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u); + butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v); + add_rnd(u, v, &x[4]); + sub_rnd(u, v, &x[12]); + + butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u); + butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v); + add_rnd(u, v, &x[6]); + sub_rnd(u, v, &x[14]); + + // stage 2 + s[0] = _mm256_add_epi16(x[0], x[4]); + s[1] = _mm256_add_epi16(x[1], x[5]); + s[2] = _mm256_add_epi16(x[2], x[6]); + s[3] = _mm256_add_epi16(x[3], x[7]); + s[4] = _mm256_sub_epi16(x[0], x[4]); + s[5] = _mm256_sub_epi16(x[1], x[5]); + s[6] = _mm256_sub_epi16(x[2], x[6]); + s[7] = _mm256_sub_epi16(x[3], x[7]); + butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u); + butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v); + add_rnd(u, v, &s[8]); + sub_rnd(u, v, &s[12]); + + butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u); + butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v); + add_rnd(u, v, &s[10]); + sub_rnd(u, v, &s[14]); + + // stage 3 + x[0] = _mm256_add_epi16(s[0], s[2]); + x[1] = _mm256_add_epi16(s[1], s[3]); + x[2] = _mm256_sub_epi16(s[0], s[2]); + x[3] = _mm256_sub_epi16(s[1], s[3]); + + x[8] = _mm256_add_epi16(s[8], s[10]); + x[9] = _mm256_add_epi16(s[9], s[11]); + x[10] = _mm256_sub_epi16(s[8], s[10]); + x[11] = _mm256_sub_epi16(s[9], s[11]); + + butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u); + butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v); + add_rnd(u, v, &x[4]); + sub_rnd(u, v, &x[6]); + + butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u); + butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v); + add_rnd(u, v, &x[12]); + sub_rnd(u, v, &x[14]); + + // stage 4 + butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u); + butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v); + butterfly_rnd(u, &x[2]); + butterfly_rnd(v, &x[6]); + + butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u); + butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v); + butterfly_rnd(u, &x[10]); + butterfly_rnd(v, &x[14]); + + in[0] = x[0]; + in[1] = _mm256_sub_epi16(zero, x[8]); + in[2] = x[12]; + in[3] = _mm256_sub_epi16(zero, x[4]); + in[4] = x[6]; + in[5] = x[14]; + in[6] = x[10]; + in[7] = x[2]; + in[8] = x[3]; + in[9] = x[11]; + in[10] = x[15]; + in[11] = x[7]; + in[12] = x[5]; + in[13] = _mm256_sub_epi16(zero, x[13]); + in[14] = x[9]; + in[15] = _mm256_sub_epi16(zero, x[1]); +} + +static void iadst16(__m256i *in) { + mm256_transpose_16x16(in); + iadst16_avx2(in); +} + +#if CONFIG_EXT_TX +static void flip_row(__m256i *in, int rows) { + int i; + for (i = 0; i < rows; ++i) { + mm256_reverse_epi16(&in[i]); + } +} + +static void flip_col(uint8_t **dest, int *stride, int rows) { + *dest = *dest + (rows - 1) * (*stride); + *stride = -*stride; +} + +static void 
iidtx16(__m256i *in) { + mm256_transpose_16x16(in); + txfm_scaling16_avx2(Sqrt2, in); +} +#endif + +void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m256i in[16]; + + load_buffer_16x16(input, in); + switch (tx_type) { + case DCT_DCT: + idct16(in); + idct16(in); + break; + case ADST_DCT: + idct16(in); + iadst16(in); + break; + case DCT_ADST: + iadst16(in); + idct16(in); + break; + case ADST_ADST: + iadst16(in); + iadst16(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + idct16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case DCT_FLIPADST: + iadst16(in); + idct16(in); + flip_row(in, 16); + break; + case FLIPADST_FLIPADST: + iadst16(in); + iadst16(in); + flip_row(in, 16); + flip_col(&dest, &stride, 16); + break; + case ADST_FLIPADST: + iadst16(in); + iadst16(in); + flip_row(in, 16); + break; + case FLIPADST_ADST: + iadst16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case IDTX: + iidtx16(in); + iidtx16(in); + break; + case V_DCT: + iidtx16(in); + idct16(in); + break; + case H_DCT: + idct16(in); + iidtx16(in); + break; + case V_ADST: + iidtx16(in); + iadst16(in); + break; + case H_ADST: + iadst16(in); + iidtx16(in); + break; + case V_FLIPADST: + iidtx16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case H_FLIPADST: + iadst16(in); + iidtx16(in); + flip_row(in, 16); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + write_buffer_16x16(in, stride, dest); +} diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c new file mode 100644 index 000000000..522e8988c --- /dev/null +++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c @@ -0,0 +1,1402 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./av1_rtcd.h" +#include "aom_dsp/x86/inv_txfm_sse2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" +#include "av1/common/enums.h" + +#if CONFIG_EXT_TX +static INLINE void fliplr_4x4(__m128i in[2]) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[0] = _mm_shufflehi_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[1] = _mm_shufflehi_epi16(in[1], 0x1b); +} + +static INLINE void fliplr_8x8(__m128i in[8]) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); +} + +static INLINE void fliplr_16x8(__m128i in[16]) { + fliplr_8x8(&in[0]); + fliplr_8x8(&in[8]); +} + +#define FLIPLR_16x16(in0, in1) \ + do { \ + __m128i *tmp; \ + fliplr_16x8(in0); \ + fliplr_16x8(in1); \ + tmp = (in0); \ + (in0) = (in1); \ + (in1) = tmp; \ + } while (0) + +#define FLIPUD_PTR(dest, stride, size) \ + do { \ + (dest) = (dest) + ((size)-1) * (stride); \ + (stride) = -(stride); \ + } while (0) +#endif + +void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[2]; + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8); + + switch (tx_type) { + case DCT_DCT: + aom_idct4_sse2(in); + aom_idct4_sse2(in); + break; + case ADST_DCT: + aom_idct4_sse2(in); + aom_iadst4_sse2(in); + break; + case DCT_ADST: + aom_iadst4_sse2(in); + aom_idct4_sse2(in); + break; + case ADST_ADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct4_sse2(in); + aom_iadst4_sse2(in); + FLIPUD_PTR(dest, stride, 4); + break; + case DCT_FLIPADST: + aom_iadst4_sse2(in); + aom_idct4_sse2(in); + fliplr_4x4(in); + break; + case FLIPADST_FLIPADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + FLIPUD_PTR(dest, stride, 4); + fliplr_4x4(in); + break; + case ADST_FLIPADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + fliplr_4x4(in); + break; + case FLIPADST_ADST: + aom_iadst4_sse2(in); + aom_iadst4_sse2(in); + FLIPUD_PTR(dest, stride, 4); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0)); + __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d0 = _mm_unpacklo_epi32(d0, d1); + d2 = _mm_unpacklo_epi32(d2, d3); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } +} + +void 
av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + // load input data + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); + in[4] = load_input_data(input + 8 * 4); + in[5] = load_input_data(input + 8 * 5); + in[6] = load_input_data(input + 8 * 6); + in[7] = load_input_data(input + 8 * 7); + + switch (tx_type) { + case DCT_DCT: + aom_idct8_sse2(in); + aom_idct8_sse2(in); + break; + case ADST_DCT: + aom_idct8_sse2(in); + aom_iadst8_sse2(in); + break; + case DCT_ADST: + aom_iadst8_sse2(in); + aom_idct8_sse2(in); + break; + case ADST_ADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct8_sse2(in); + aom_iadst8_sse2(in); + FLIPUD_PTR(dest, stride, 8); + break; + case DCT_FLIPADST: + aom_iadst8_sse2(in); + aom_idct8_sse2(in); + fliplr_8x8(in); + break; + case FLIPADST_FLIPADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + FLIPUD_PTR(dest, stride, 8); + fliplr_8x8(in); + break; + case ADST_FLIPADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + fliplr_8x8(in); + break; + case FLIPADST_ADST: + aom_iadst8_sse2(in); + aom_iadst8_sse2(in); + FLIPUD_PTR(dest, stride, 8); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); +} + +#if CONFIG_EXT_TX +static void iidtx16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idtx16_8col(in0); + idtx16_8col(in1); +} +#endif // CONFIG_EXT_TX + +void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in[32]; + __m128i *in0 = &in[0]; + __m128i *in1 = &in[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case DCT_DCT: + aom_idct16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case DCT_ADST: + aom_iadst16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case DCT_FLIPADST: + aom_iadst16_sse2(in0, in1); + 
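+      // As in the cases above, each aom_*16_sse2 call effectively
+      // transposes and then runs one 1-D 16-point transform over the two
+      // 8-column halves in0/in1, so the pair of calls yields the 2-D
+      // inverse; the FLIP* cases reuse the same kernels and only mirror
+      // via FLIPLR_16x16 / FLIPUD_PTR.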
aom_idct16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + FLIPLR_16x16(in0, in1); + break; + case ADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case IDTX: + iidtx16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_DCT: + iidtx16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case H_DCT: + aom_idct16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_ADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case H_ADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_FLIPADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case H_FLIPADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + +#if CONFIG_EXT_TX +static void iidtx8_sse2(__m128i *in) { + in[0] = _mm_slli_epi16(in[0], 1); + in[1] = _mm_slli_epi16(in[1], 1); + in[2] = _mm_slli_epi16(in[2], 1); + in[3] = _mm_slli_epi16(in[3], 1); + in[4] = _mm_slli_epi16(in[4], 1); + in[5] = _mm_slli_epi16(in[5], 1); + in[6] = _mm_slli_epi16(in[6], 1); + in[7] = _mm_slli_epi16(in[7], 1); +} + +static INLINE void iidtx4_sse2(__m128i *in) { + const __m128i v_scale_w = _mm_set1_epi16(Sqrt2); + + const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); + const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); + const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); + const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); + + const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); + + in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); + in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); +} + +// load 8x8 array +static INLINE void flip_buffer_lr_8x8(__m128i *in) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); +} +#endif // CONFIG_EXT_TX + +void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in[16]; + + in[0] = load_input_data(input + 0 * 8); + in[1] = load_input_data(input + 1 * 8); + in[2] = load_input_data(input + 2 * 8); + in[3] = load_input_data(input + 3 * 8); + in[4] = load_input_data(input + 4 * 8); + in[5] = load_input_data(input + 5 * 8); + in[6] = load_input_data(input + 6 * 8); + in[7] = load_input_data(input + 7 * 8); + + in[8] = load_input_data(input + 8 * 8); + in[9] = load_input_data(input + 9 * 8); + in[10] = load_input_data(input + 10 * 8); + in[11] = load_input_data(input + 11 * 8); + in[12] = load_input_data(input + 12 * 8); + in[13] = load_input_data(input + 13 * 8); + 
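+  // (in[0]..in[15] hold the sixteen 8-wide coefficient rows of the 8x16
+  // block, loaded at a stride of 8.)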
in[14] = load_input_data(input + 14 * 8); + in[15] = load_input_data(input + 15 * 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct8_sse2(in); + array_transpose_8x8(in, in); + aom_idct8_sse2(in + 8); + array_transpose_8x8(in + 8, in + 8); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + aom_iadst8_sse2(in); + array_transpose_8x8(in, in); + aom_iadst8_sse2(in + 8); + array_transpose_8x8(in + 8, in + 8); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: + iidtx8_sse2(in); + iidtx8_sse2(in + 8); + break; +#endif + default: assert(0); break; + } + scale_sqrt2_8x8(in); + scale_sqrt2_8x8(in + 8); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + idct16_8col(in); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + iadst16_8col(in); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: idtx16_8col(in); break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case H_DCT: +#endif + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + write_buffer_8x16(dest, in, stride); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x16(dest, in, stride); + break; + case FLIPADST_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x16(dest + stride * 15, in, -stride); + break; +#endif + default: assert(0); break; + } +} + +static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); +} + +void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + 
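+  // 16x8: rows are 16 coefficients wide, so the row transform uses the
+  // 16-point kernels on a transposed pair of 8x8 halves, while the column
+  // transform is 8 points tall. The scale_sqrt2_8x8 calls in between apply
+  // the sqrt(2) factor that keeps the rectangular 2-D transform scaled
+  // consistently with the square ones.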
__m128i in[16]; + + // Transpose 16x8 input into in[] + in[0] = load_input_data(input + 0 * 16); + in[1] = load_input_data(input + 1 * 16); + in[2] = load_input_data(input + 2 * 16); + in[3] = load_input_data(input + 3 * 16); + in[4] = load_input_data(input + 4 * 16); + in[5] = load_input_data(input + 5 * 16); + in[6] = load_input_data(input + 6 * 16); + in[7] = load_input_data(input + 7 * 16); + array_transpose_8x8(in, in); + + in[8] = load_input_data(input + 8 + 0 * 16); + in[9] = load_input_data(input + 8 + 1 * 16); + in[10] = load_input_data(input + 8 + 2 * 16); + in[11] = load_input_data(input + 8 + 3 * 16); + in[12] = load_input_data(input + 8 + 4 * 16); + in[13] = load_input_data(input + 8 + 5 * 16); + in[14] = load_input_data(input + 8 + 6 * 16); + in[15] = load_input_data(input + 8 + 7 * 16); + array_transpose_8x8(in + 8, in + 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + idct16_8col(in); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + iadst16_8col(in); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: idtx16_8col(in); break; +#endif + default: assert(0); break; + } + + // Scale + scale_sqrt2_8x8(in); + scale_sqrt2_8x8(in + 8); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct8_sse2(in); + aom_idct8_sse2(in + 8); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + aom_iadst8_sse2(in); + aom_iadst8_sse2(in + 8); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + iidtx8_sse2(in); + iidtx8_sse2(in + 8); + break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + write_buffer_8x8_round6(dest, in, stride); + write_buffer_8x8_round6(dest + 8, in + 8, stride); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: + write_buffer_8x8_round6(dest + stride * 7, in, -stride); + write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride); + break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x8_round6(dest, in + 8, stride); + write_buffer_8x8_round6(dest + 8, in, stride); + break; + case FLIPADST_FLIPADST: + flip_buffer_lr_8x8(in); + flip_buffer_lr_8x8(in + 8); + write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride); + write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride); + break; +#endif + default: assert(0); break; + } +} + +static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + + in[0] = 
_mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); +} + +void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + + in[0] = load_input_data(input + 0 * 8); + in[1] = load_input_data(input + 1 * 8); + in[2] = load_input_data(input + 2 * 8); + in[3] = load_input_data(input + 3 * 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct8_sse2(in); + break; + case DCT_ADST: + case ADST_ADST: aom_iadst8_sse2(in); break; +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: aom_iadst8_sse2(in); break; + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in); +#endif + break; + default: assert(0); break; + } + + scale_sqrt2_8x8(in); + + // Repack data. We pack into the bottom half of 'in' + // so that the next repacking stage can pack into the + // top half without overwriting anything + in[7] = _mm_unpacklo_epi64(in[6], in[7]); + in[6] = _mm_unpacklo_epi64(in[4], in[5]); + in[5] = _mm_unpacklo_epi64(in[2], in[3]); + in[4] = _mm_unpacklo_epi64(in[0], in[1]); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct4_sse2(in + 4); + aom_idct4_sse2(in + 6); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + aom_iadst4_sse2(in + 4); + aom_iadst4_sse2(in + 6); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: + iidtx4_sse2(in + 4); + array_transpose_4x4(in + 4); + iidtx4_sse2(in + 6); + array_transpose_4x4(in + 6); + break; +#endif + default: assert(0); break; + } + + // Repack data + in[0] = _mm_unpacklo_epi64(in[4], in[6]); + in[1] = _mm_unpackhi_epi64(in[4], in[6]); + in[2] = _mm_unpacklo_epi64(in[5], in[7]); + in[3] = _mm_unpackhi_epi64(in[5], in[7]); + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + break; + case FLIPADST_FLIPADST: + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + FLIPUD_PTR(dest, stride, 4); +#endif + break; + default: assert(0); break; + } + write_buffer_8x4_round5(dest, in, stride); +} + +static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], 
final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0)); + __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4)); + __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5)); + __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6)); + __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7)); + + d0 = _mm_unpacklo_epi32(d0, d1); + d2 = _mm_unpacklo_epi32(d2, d3); + d4 = _mm_unpacklo_epi32(d4, d5); + d6 = _mm_unpacklo_epi32(d6, d7); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d4 = _mm_unpacklo_epi8(d4, zero); + d6 = _mm_unpacklo_epi8(d6, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d4 = _mm_add_epi16(d4, in[2]); + d6 = _mm_add_epi16(d6, in[3]); + + d0 = _mm_packus_epi16(d0, d2); + *(int *)dest = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + d0 = _mm_packus_epi16(d4, d6); + *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0); + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0); + } +} + +void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + + // Load rows, packed two per element of 'in'. 
+ // We pack into the bottom half of 'in' so that the + // later repacking stage can pack into the + // top half without overwriting anything + in[4] = load_input_data(input + 0 * 8); + in[5] = load_input_data(input + 1 * 8); + in[6] = load_input_data(input + 2 * 8); + in[7] = load_input_data(input + 3 * 8); + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct4_sse2(in + 4); + aom_idct4_sse2(in + 6); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + aom_iadst4_sse2(in + 4); + aom_iadst4_sse2(in + 6); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: + iidtx4_sse2(in + 4); + array_transpose_4x4(in + 4); + iidtx4_sse2(in + 6); + array_transpose_4x4(in + 6); + break; +#endif + default: assert(0); break; + } + + scale_sqrt2_8x4(in + 4); + + // Repack data + in[0] = _mm_unpacklo_epi64(in[4], in[6]); + in[1] = _mm_unpackhi_epi64(in[4], in[6]); + in[2] = _mm_unpacklo_epi64(in[5], in[7]); + in[3] = _mm_unpackhi_epi64(in[5], in[7]); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct8_sse2(in); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + aom_iadst8_sse2(in); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: + iidtx8_sse2(in); + array_transpose_8x8(in, in); + break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + in[4] = _mm_shufflelo_epi16(in[4], 0x1b); + in[5] = _mm_shufflelo_epi16(in[5], 0x1b); + in[6] = _mm_shufflelo_epi16(in[6], 0x1b); + in[7] = _mm_shufflelo_epi16(in[7], 0x1b); + break; + case FLIPADST_FLIPADST: + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + in[4] = _mm_shufflelo_epi16(in[4], 0x1b); + in[5] = _mm_shufflelo_epi16(in[5], 0x1b); + in[6] = _mm_shufflelo_epi16(in[6], 0x1b); + in[7] = _mm_shufflelo_epi16(in[7], 0x1b); + FLIPUD_PTR(dest, stride, 8); + break; +#endif + default: assert(0); break; + } + in[0] = _mm_unpacklo_epi64(in[0], in[1]); + in[1] = _mm_unpacklo_epi64(in[2], in[3]); + in[2] = _mm_unpacklo_epi64(in[4], in[5]); + in[3] = _mm_unpacklo_epi64(in[6], in[7]); + write_buffer_4x8_round5(dest, in, stride); +} + +// Note: The 16-column 32-element transforms take input in the form of four +// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants +// of the overall 16x32 input buffer. 
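+// For av1_iht16x32_512_add_sse2 below, the four quadrants are laid out as:
+//   tl = rows  0..15, cols 0..7     tr = rows  0..15, cols 8..15
+//   bl = rows 16..31, cols 0..7     br = rows 16..31, cols 8..15
+// with each quadrant holding sixteen rows of eight 16-bit coefficients.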
+static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + array_transpose_16x16(tl, tr); + array_transpose_16x16(bl, br); + idct32_8col(tl, bl); + idct32_8col(tr, br); +} + +static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + __m128i tmpl[16], tmpr[16]; + int i; + + // Copy the top half of the input to temporary storage + for (i = 0; i < 16; ++i) { + tmpl[i] = tl[i]; + tmpr[i] = tr[i]; + } + + // Generate the top half of the output + for (i = 0; i < 16; ++i) { + tl[i] = _mm_slli_epi16(bl[i], 2); + tr[i] = _mm_slli_epi16(br[i], 2); + } + array_transpose_16x16(tl, tr); + + // Copy the temporary storage back to the bottom half of the input + for (i = 0; i < 16; ++i) { + bl[i] = tmpl[i]; + br[i] = tmpr[i]; + } + + // Generate the bottom half of the output + scale_sqrt2_8x16(bl); + scale_sqrt2_8x16(br); + aom_idct16_sse2(bl, br); // Includes a transposition +} + +#if CONFIG_EXT_TX +static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + int i; + array_transpose_16x16(tl, tr); + array_transpose_16x16(bl, br); + for (i = 0; i < 16; ++i) { + tl[i] = _mm_slli_epi16(tl[i], 2); + tr[i] = _mm_slli_epi16(tr[i], 2); + bl[i] = _mm_slli_epi16(bl[i], 2); + br[i] = _mm_slli_epi16(br[i], 2); + } +} +#endif // CONFIG_EXT_TX + +static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl, + __m128i *intr, __m128i *inbl, + __m128i *inbr, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int i; + + for (i = 0; i < 16; ++i) { + intl[i] = _mm_adds_epi16(intl[i], final_rounding); + intr[i] = _mm_adds_epi16(intr[i], final_rounding); + inbl[i] = _mm_adds_epi16(inbl[i], final_rounding); + inbr[i] = _mm_adds_epi16(inbr[i], final_rounding); + intl[i] = _mm_srai_epi16(intl[i], 6); + intr[i] = _mm_srai_epi16(intr[i], 6); + inbl[i] = _mm_srai_epi16(inbl[i], 6); + inbr[i] = _mm_srai_epi16(inbr[i], 6); + RECON_AND_STORE(dest + i * stride + 0, intl[i]); + RECON_AND_STORE(dest + i * stride + 8, intr[i]); + RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]); + RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]); + } +} + +void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i intl[16], intr[16], inbl[16], inbr[16]; + + int i; + for (i = 0; i < 16; ++i) { + intl[i] = load_input_data(input + i * 16 + 0); + intr[i] = load_input_data(input + i * 16 + 8); + inbl[i] = load_input_data(input + (i + 16) * 16 + 0); + inbr[i] = load_input_data(input + (i + 16) * 16 + 8); + } + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + aom_idct16_sse2(intl, intr); + aom_idct16_sse2(inbl, inbr); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + aom_iadst16_sse2(intl, intr); + aom_iadst16_sse2(inbl, inbr); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: + iidtx16_sse2(intl, intr); + iidtx16_sse2(inbl, inbr); + break; +#endif + default: assert(0); break; + } + + scale_sqrt2_8x16(intl); + scale_sqrt2_8x16(intr); + scale_sqrt2_8x16(inbl); + scale_sqrt2_8x16(inbr); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + idct32_16col(intl, intr, inbl, 
inbr); + break; + case ADST_DCT: + case ADST_ADST: +#if CONFIG_EXT_TX + case FLIPADST_ADST: + case ADST_FLIPADST: + case FLIPADST_FLIPADST: + case FLIPADST_DCT: + case V_ADST: + case V_FLIPADST: +#endif + ihalfright32_16col(intl, intr, inbl, inbr); + break; +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case H_FLIPADST: + case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break; +#endif + default: assert(0); break; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case H_DCT: + case H_ADST: + case V_ADST: + case V_DCT: + case IDTX: +#endif + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + for (i = 0; i < 16; ++i) { + __m128i tmp = intl[i]; + intl[i] = mm_reverse_epi16(intr[i]); + intr[i] = mm_reverse_epi16(tmp); + tmp = inbl[i]; + inbl[i] = mm_reverse_epi16(inbr[i]); + inbr[i] = mm_reverse_epi16(tmp); + } + break; + case FLIPADST_FLIPADST: + for (i = 0; i < 16; ++i) { + __m128i tmp = intl[i]; + intl[i] = mm_reverse_epi16(intr[i]); + intr[i] = mm_reverse_epi16(tmp); + tmp = inbl[i]; + inbl[i] = mm_reverse_epi16(inbr[i]); + inbr[i] = mm_reverse_epi16(tmp); + } + FLIPUD_PTR(dest, stride, 32); + break; +#endif + default: assert(0); break; + } + write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride); +} + +static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0, + __m128i *in1, __m128i *in2, + __m128i *in3, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int i; + + for (i = 0; i < 16; ++i) { + in0[i] = _mm_adds_epi16(in0[i], final_rounding); + in1[i] = _mm_adds_epi16(in1[i], final_rounding); + in2[i] = _mm_adds_epi16(in2[i], final_rounding); + in3[i] = _mm_adds_epi16(in3[i], final_rounding); + in0[i] = _mm_srai_epi16(in0[i], 6); + in1[i] = _mm_srai_epi16(in1[i], 6); + in2[i] = _mm_srai_epi16(in2[i], 6); + in3[i] = _mm_srai_epi16(in3[i], 6); + RECON_AND_STORE(dest + i * stride + 0, in0[i]); + RECON_AND_STORE(dest + i * stride + 8, in1[i]); + RECON_AND_STORE(dest + i * stride + 16, in2[i]); + RECON_AND_STORE(dest + i * stride + 24, in3[i]); + } +} + +void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in0[16], in1[16], in2[16], in3[16]; + int i; + + for (i = 0; i < 16; ++i) { + in0[i] = load_input_data(input + i * 32 + 0); + in1[i] = load_input_data(input + i * 32 + 8); + in2[i] = load_input_data(input + i * 32 + 16); + in3[i] = load_input_data(input + i * 32 + 24); + } + + // Row transform + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case H_DCT: +#endif + idct32_16col(in0, in1, in2, in3); + break; + case DCT_ADST: + case ADST_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case H_ADST: + case H_FLIPADST: +#endif + ihalfright32_16col(in0, in1, in2, in3); + break; +#if CONFIG_EXT_TX + case V_FLIPADST: + case V_ADST: + case V_DCT: + case IDTX: iidtx32_16col(in0, in1, in2, in3); break; +#endif + default: assert(0); break; + } + + scale_sqrt2_8x16(in0); + scale_sqrt2_8x16(in1); + scale_sqrt2_8x16(in2); + scale_sqrt2_8x16(in3); + + // Column transform + switch (tx_type) { + case DCT_DCT: + case DCT_ADST: +#if CONFIG_EXT_TX + case DCT_FLIPADST: + case V_DCT: +#endif + aom_idct16_sse2(in0, in1); + aom_idct16_sse2(in2, in3); + 
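+      // Each aom_idct16_sse2 call transforms a 16-row, 16-column half of the
+      // 32-wide block: in0/in1 cover columns 0..15, in2/in3 columns 16..31.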
break;
+    case ADST_DCT:
+    case ADST_ADST:
+#if CONFIG_EXT_TX
+    case FLIPADST_ADST:
+    case ADST_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case FLIPADST_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+#endif
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in2, in3);
+      break;
+#if CONFIG_EXT_TX
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+    case IDTX:
+      iidtx16_sse2(in0, in1);
+      iidtx16_sse2(in2, in3);
+      break;
+#endif
+    default: assert(0); break;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
+    case H_ADST:
+    case V_ADST:
+    case V_DCT:
+    case IDTX:
+#endif
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      for (i = 0; i < 16; ++i) {
+        __m128i tmp1 = in0[i];
+        __m128i tmp2 = in1[i];
+        in0[i] = mm_reverse_epi16(in3[i]);
+        in1[i] = mm_reverse_epi16(in2[i]);
+        in2[i] = mm_reverse_epi16(tmp2);
+        in3[i] = mm_reverse_epi16(tmp1);
+      }
+      break;
+    case FLIPADST_FLIPADST:
+      for (i = 0; i < 16; ++i) {
+        __m128i tmp1 = in0[i];
+        __m128i tmp2 = in1[i];
+        in0[i] = mm_reverse_epi16(in3[i]);
+        in1[i] = mm_reverse_epi16(in2[i]);
+        in2[i] = mm_reverse_epi16(tmp2);
+        in3[i] = mm_reverse_epi16(tmp1);
+      }
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+#endif
+    default: assert(0); break;
+  }
+  write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
+}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.c b/third_party/aom/av1/common/x86/pvq_sse4.c
new file mode 100644
index 000000000..b3ed9efdf
--- /dev/null
+++ b/third_party/aom/av1/common/x86/pvq_sse4.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <float.h>
+#include <math.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/x86/pvq_sse4.h"
+#include "../odintrin.h"
+#include "av1/common/pvq.h"
+
+#define EPSILON 1e-15f
+
+static __m128 horizontal_sum_ps(__m128 x) {
+  x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
+  x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
+  return x;
+}
+
+static __m128i horizontal_sum_epi32(__m128i x) {
+  x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+  x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
+  return x;
+}
+
+static INLINE float rsqrtf(float x) {
+  float y;
+  _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
+  return y;
+}
+
+/** Find the codepoint on the given PSphere closest to the desired
+ * vector. This is a float-precision PVQ search just to make sure
+ * our tests aren't limited by numerical accuracy. It's close to the
+ * pvq_search_rdo_double_c implementation, but is not bit accurate and
+ * it performs slightly worse on PSNR. One reason is that this code runs
+ * more RDO iterations than the C code. It also uses single precision
+ * floating point math, whereas the C version uses double precision.
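+ *
+ * The search is greedy: each iteration adds the single pulse at the
+ * position j that maximizes (xy + x[j]) / sqrt(yy + 2 * y[j] + 1) minus a
+ * per-position rate penalty, i.e. the position giving the best cosine
+ * distance for the RDO cost.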
+ *
+ * @param [in]      xcoeff  input vector to quantize (x in the math doc)
+ * @param [in]      n       number of dimensions
+ * @param [in]      k       number of pulses
+ * @param [out]     ypulse  optimal codevector found (y in the math doc)
+ * @param [in]      g2      multiplier for the distortion (typically squared
+ *                          gain units)
+ * @param [in]      pvq_norm_lambda  enc->pvq_norm_lambda for quantized RDO
+ * @param [in]      prev_k  number of pulses already in ypulse that we should
+ *                          reuse for the search (or 0 for a new search)
+ * @return cosine distance between x and y (between 0 and 1)
+ */
+double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k,
+                                    int *ypulse, double g2,
+                                    double pvq_norm_lambda, int prev_k) {
+  int i, j;
+  int reuse_pulses = prev_k > 0 && prev_k <= k;
+  /* TODO - This blows our 8kB stack space budget and should be fixed when
+     converting PVQ to fixed point. */
+  float xx = 0, xy = 0, yy = 0;
+  float x[MAXN + 3];
+  float y[MAXN + 3];
+  float sign_y[MAXN + 3];
+  for (i = 0; i < n; i++) {
+    float tmp = (float)xcoeff[i];
+    xx += tmp * tmp;
+    x[i] = xcoeff[i];
+  }
+
+  x[n] = x[n + 1] = x[n + 2] = 0;
+  ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;
+
+  __m128 sums = _mm_setzero_ps();
+  for (i = 0; i < n; i += 4) {
+    __m128 x4 = _mm_loadu_ps(&x[i]);
+    __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
+    /* Save the sign, we'll put it back later. */
+    _mm_storeu_ps(&sign_y[i], s4);
+    /* Get rid of the sign. */
+    x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
+    sums = _mm_add_ps(sums, x4);
+    if (!reuse_pulses) {
+      /* Clear y and ypulse in case we don't do the projection. */
+      _mm_storeu_ps(&y[i], _mm_setzero_ps());
+      _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
+    }
+    _mm_storeu_ps(&x[i], x4);
+  }
+  sums = horizontal_sum_ps(sums);
+  int pulses_left = k;
+  {
+    __m128i pulses_sum;
+    __m128 yy4, xy4;
+    xy4 = yy4 = _mm_setzero_ps();
+    pulses_sum = _mm_setzero_si128();
+    if (reuse_pulses) {
+      /* We reuse pulses from a previous search so we don't have to search them
+         again. */
+      for (j = 0; j < n; j += 4) {
+        __m128 x4, y4;
+        __m128i iy4;
+        iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
+        pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+        _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
+        y4 = _mm_cvtepi32_ps(iy4);
+        x4 = _mm_loadu_ps(&x[j]);
+        xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+        yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+        /* Double the y[] vector so we don't have to do it in the search loop.
+         */
+        _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+      }
+      pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
+      xy4 = horizontal_sum_ps(xy4);
+      xy = _mm_cvtss_f32(xy4);
+      yy4 = horizontal_sum_ps(yy4);
+      yy = _mm_cvtss_f32(yy4);
+    } else if (k > (n >> 1)) {
+      /* Do a pre-search by projecting on the pyramid. */
+      __m128 rcp4;
+      float sum = _mm_cvtss_f32(sums);
+      /* If x is too small, just replace it with a pulse at 0. This prevents
+         infinities and NaNs from causing too many pulses to be allocated. */
+      if (sum <= EPSILON) {
+        x[0] = 1.f;
+        for (i = 1; i < n; i++) {
+          x[i] = 0;
+        }
+        sums = _mm_set_ps1(1.f);
+      }
+      /* Using k + e with e < 1 guarantees we cannot get more than k pulses.
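+         For example, with k = 10 and sum = 20 the scale factor is roughly
+         10.8 / 20 = 0.54 (up to the _mm_rcp_ps approximation), so a
+         coefficient of magnitude 5 contributes floor(0.54 * 5) = 2 pulses.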
*/ + rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums)); + xy4 = yy4 = _mm_setzero_ps(); + pulses_sum = _mm_setzero_si128(); + for (j = 0; j < n; j += 4) { + __m128 rx4, x4, y4; + __m128i iy4; + x4 = _mm_loadu_ps(&x[j]); + rx4 = _mm_mul_ps(x4, rcp4); + iy4 = _mm_cvttps_epi32(rx4); + pulses_sum = _mm_add_epi32(pulses_sum, iy4); + _mm_storeu_si128((__m128i *)&ypulse[j], iy4); + y4 = _mm_cvtepi32_ps(iy4); + xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); + yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); + /* Double the y[] vector so we don't have to do it in the search loop. + */ + _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); + } + pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum)); + xy = _mm_cvtss_f32(horizontal_sum_ps(xy4)); + yy = _mm_cvtss_f32(horizontal_sum_ps(yy4)); + } + x[n] = x[n + 1] = x[n + 2] = -100; + y[n] = y[n + 1] = y[n + 2] = 100; + } + + /* This should never happen. */ + OD_ASSERT(pulses_left <= n + 3); + + float lambda_delta_rate[MAXN + 3]; + if (pulses_left) { + /* Hoist lambda to avoid the multiply in the loop. */ + float lambda = + 0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2); + float delta_rate = 3.f / n; + __m128 count = _mm_set_ps(3, 2, 1, 0); + for (i = 0; i < n; i += 4) { + _mm_storeu_ps(&lambda_delta_rate[i], + _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate))); + count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4)); + } + } + lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] = + 1e30f; + + for (i = 0; i < pulses_left; i++) { + int best_id = 0; + __m128 xy4, yy4; + __m128 max, max2; + __m128i count; + __m128i pos; + + /* The squared magnitude term gets added anyway, so we might as well + add it outside the loop. */ + yy = yy + 1; + xy4 = _mm_load1_ps(&xy); + yy4 = _mm_load1_ps(&yy); + max = _mm_setzero_ps(); + pos = _mm_setzero_si128(); + count = _mm_set_epi32(3, 2, 1, 0); + for (j = 0; j < n; j += 4) { + __m128 x4, y4, r4; + x4 = _mm_loadu_ps(&x[j]); + y4 = _mm_loadu_ps(&y[j]); + x4 = _mm_add_ps(x4, xy4); + y4 = _mm_add_ps(y4, yy4); + y4 = _mm_rsqrt_ps(y4); + r4 = _mm_mul_ps(x4, y4); + /* Subtract lambda. */ + r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j])); + /* Update the index of the max. */ + pos = _mm_max_epi16( + pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); + /* Update the max. */ + max = _mm_max_ps(max, r4); + /* Update the indices (+4) */ + count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4)); + } + /* Horizontal max. */ + max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2))); + max2 = + _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1))); + /* Now that max2 contains the max at all positions, look at which value(s) + of the + partial max is equal to the global max. */ + pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2))); + pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); + pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); + best_id = _mm_cvtsi128_si32(pos); + OD_ASSERT(best_id < n); + /* Updating the sums of the new pulse(s) */ + xy = xy + x[best_id]; + /* We're multiplying y[j] by two so we don't have to do it here. */ + yy = yy + y[best_id]; + /* Only now that we've made the final choice, update y/ypulse. */ + /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */ + y[best_id] += 2; + ypulse[best_id]++; + } + + /* Put the original sign back. 
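+     sign_y[i] is 0 for non-negative x[i] and 0xffffffff for negative x[i],
+     so (y + s) ^ s below is a branchless two's-complement negation when the
+     mask is all ones and a no-op when it is zero.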
*/
+  for (i = 0; i < n; i += 4) {
+    __m128i y4;
+    __m128i s4;
+    y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
+    s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
+    y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
+    _mm_storeu_si128((__m128i *)&ypulse[i], y4);
+  }
+  return xy * rsqrtf(xx * yy + FLT_MIN);
+}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.h b/third_party/aom/av1/common/x86/pvq_sse4.h
new file mode 100644
index 000000000..3c4ce8543
--- /dev/null
+++ b/third_party/aom/av1/common/x86/pvq_sse4.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_PVQ_X86_SSE4_H_
+#define AOM_COMMON_PVQ_X86_SSE4_H_
+#endif  // AOM_COMMON_PVQ_X86_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 000000000..260faa8c9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,1805 @@
+#include <smmintrin.h>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "av1/common/restoration.h"
+
+/* Calculate four consecutive entries of the intermediate A and B arrays
+   (corresponding to the first loop in the C version of
+   av1_selfguided_restoration)
+*/
+static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
+                       __m128i one_over_n, __m128i s, int bit_depth, int idx,
+                       int32_t *A, int32_t *B) {
+  __m128i a, b, p;
+#if CONFIG_HIGHBITDEPTH
+  if (bit_depth > 8) {
+    __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
+    __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
+    __m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8));
+    __m128i shift_b = _mm_set_epi64x(0, bit_depth - 8);
+    a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
+    b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
+    a = _mm_mullo_epi32(a, n);
+    b = _mm_mullo_epi32(b, b);
+    p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
+  } else {
+#endif
+    (void)bit_depth;
+    a = _mm_mullo_epi32(sum_sq, n);
+    b = _mm_mullo_epi32(sum, sum);
+    p = _mm_sub_epi32(a, b);
+#if CONFIG_HIGHBITDEPTH
+  }
+#endif
+
+  __m128i rounding_z = _mm_set1_epi32((1 << SGRPROJ_MTABLE_BITS) >> 1);
+  __m128i z = _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rounding_z),
+                             SGRPROJ_MTABLE_BITS);
+  z = _mm_min_epi32(z, _mm_set1_epi32(255));
+
+  // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+  // gather using scalar loads.
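+  // In effect this computes a_res[i] = x_by_xplus1[z[i]] for each of the
+  // four 32-bit lanes, one table lookup at a time.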
+ __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], + x_by_xplus1[_mm_extract_epi32(z, 2)], + x_by_xplus1[_mm_extract_epi32(z, 1)], + x_by_xplus1[_mm_extract_epi32(z, 0)]); + + _mm_storeu_si128((__m128i *)&A[idx], a_res); + + __m128i rounding_res = _mm_set1_epi32((1 << SGRPROJ_RECIP_BITS) >> 1); + __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + __m128i b_int = + _mm_mullo_epi32(a_complement, _mm_mullo_epi32(sum, one_over_n)); + __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rounding_res), SGRPROJ_RECIP_BITS); + + _mm_storeu_si128((__m128i *)&B[idx], b_res); +} + +static void selfguided_restoration_1_v(uint8_t *src, int width, int height, + int src_stride, int32_t *A, int32_t *B, + int buf_stride) { + int i, j; + + // Vertical sum + // When the width is not a multiple of 4, we know that 'stride' is rounded up + // to a multiple of 4. So it is safe for this loop to calculate extra columns + // at the right-hand edge of the frame. + int width_extend = (width + 3) & ~3; + for (j = 0; j < width_extend; j += 4) { + __m128i a, b, x, y, x2, y2; + __m128i sum, sum_sq, tmp; + + a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); + + sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b)); + tmp = _mm_unpacklo_epi16(a, b); + sum_sq = _mm_madd_epi16(tmp, tmp); + + _mm_store_si128((__m128i *)&B[j], sum); + _mm_store_si128((__m128i *)&A[j], sum_sq); + + x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + for (i = 1; i < height - 2; ++i) { + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + y = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j])); + + sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + + x2 = _mm_mullo_epi32(x, x); + y2 = _mm_mullo_epi32(y, y); + + sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + } + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); + } +} + +static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width, + int height, int buf_stride, int eps, + int bit_depth) { + int i, j; + + // Horizontal sum + int width_extend = (width + 3) & ~3; + for (i = 0; i < height; ++i) { + int h = AOMMIN(2, height - i) + AOMMIN(1, i); + + __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]); + __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]); + __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]); + __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]); + + // Note: The _mm_slli_si128 call sets up a register containing + // {0, A[i * buf_stride], ..., A[i * buf_stride + 2]}, + // so that the first element of 'sum' (which should only add two values + // together) ends up calculated correctly. 
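+    // Concretely, the first lane of sum_ gets the two-tap edge sum
+    // B[i * buf_stride] + B[i * buf_stride + 1], while the other lanes get
+    // full three-tap sums of the form B[idx - 1] + B[idx] + B[idx + 1].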
+ __m128i sum_ = _mm_add_epi32(_mm_slli_si128(b1, 4), + _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))); + __m128i sum_sq_ = _mm_add_epi32( + _mm_slli_si128(a1, 4), _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))); + __m128i n = _mm_set_epi32(3 * h, 3 * h, 3 * h, 2 * h); + __m128i one_over_n = + _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[3 * h - 1], + one_by_x[3 * h - 1], one_by_x[2 * h - 1]); + __m128i s = _mm_set_epi32( + sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], + sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][2 * h - 1]); + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A, + B); + + n = _mm_set1_epi32(3 * h); + one_over_n = _mm_set1_epi32(one_by_x[3 * h - 1]); + s = _mm_set1_epi32(sgrproj_mtable[eps - 1][3 * h - 1]); + + // Re-align a1 and b1 so that they start at index i * buf_stride + 3 + a2 = _mm_alignr_epi8(a2, a1, 12); + b2 = _mm_alignr_epi8(b2, b1, 12); + + // Note: When the width is not a multiple of 4, this loop may end up + // writing to the last 4 columns of the frame, potentially with incorrect + // values (especially for r=2 and r=3). + // This is fine, since we fix up those values in the block after this + // loop, and in exchange we never have more than four values to + // write / fix up after this loop finishes. + for (j = 4; j < width_extend - 4; j += 4) { + a1 = a2; + b1 = b2; + a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]); + b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]); + /* Loop invariant: At this point, + a1 = original A[i * buf_stride + j - 1 : i * buf_stride + j + 3] + a2 = original A[i * buf_stride + j + 3 : i * buf_stride + j + 7] + and similar for b1,b2 and B + */ + sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), + _mm_alignr_epi8(b2, b1, 8))); + sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), + _mm_alignr_epi8(a2, a1, 8))); + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + A, B); + } + __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]); + __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]); + + j = width - 4; + switch (width % 4) { + case 0: + a1 = a2; + b1 = b2; + a2 = a3; + b2 = b3; + break; + case 1: + a1 = _mm_alignr_epi8(a2, a1, 4); + b1 = _mm_alignr_epi8(b2, b1, 4); + a2 = _mm_alignr_epi8(a3, a2, 4); + b2 = _mm_alignr_epi8(b3, b2, 4); + break; + case 2: + a1 = _mm_alignr_epi8(a2, a1, 8); + b1 = _mm_alignr_epi8(b2, b1, 8); + a2 = _mm_alignr_epi8(a3, a2, 8); + b2 = _mm_alignr_epi8(b3, b2, 8); + break; + case 3: + a1 = _mm_alignr_epi8(a2, a1, 12); + b1 = _mm_alignr_epi8(b2, b1, 12); + a2 = _mm_alignr_epi8(a3, a2, 12); + b2 = _mm_alignr_epi8(b3, b2, 12); + break; + } + + // Zero out the data loaded from "off the edge" of the array + __m128i zero = _mm_setzero_si128(); + a2 = _mm_blend_epi16(a2, zero, 0xfc); + b2 = _mm_blend_epi16(b2, zero, 0xfc); + + sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), + _mm_alignr_epi8(b2, b1, 8))); + sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), + _mm_alignr_epi8(a2, a1, 8))); + n = _mm_set_epi32(2 * h, 3 * h, 3 * h, 3 * h); + one_over_n = _mm_set_epi32(one_by_x[2 * h - 1], one_by_x[3 * h - 1], + one_by_x[3 * h - 1], one_by_x[3 * h - 1]); + s = _mm_set_epi32( + sgrproj_mtable[eps - 1][2 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], + sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]); + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + A, B); + 
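+    // Row i of A and B now holds the box-sum-derived (a, b) pair for every
+    // column, including the re-computed rightmost four entries.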
} +} + +static void selfguided_restoration_2_v(uint8_t *src, int width, int height, + int src_stride, int32_t *A, int32_t *B, + int buf_stride) { + int i, j; + + // Vertical sum + int width_extend = (width + 3) & ~3; + for (j = 0; j < width_extend; j += 4) { + __m128i a, b, c, c2, x, y, x2, y2; + __m128i sum, sum_sq, tmp; + + a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); + c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + + sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c)); + // Important: Since c may be up to 2^8, the result on squaring may + // be up to 2^16. So we need to zero-extend, not sign-extend. + c2 = _mm_cvtepu16_epi32(_mm_mullo_epi16(c, c)); + tmp = _mm_unpacklo_epi16(a, b); + sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2); + + _mm_store_si128((__m128i *)&B[j], sum); + _mm_store_si128((__m128i *)&A[j], sum_sq); + + x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + for (i = 2; i < height - 3; ++i) { + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)&src[(i - 2) * src_stride + j]))); + y = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)&src[(i + 3) * src_stride + j]))); + + sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + + x2 = _mm_mullo_epi32(x, x); + y2 = _mm_mullo_epi32(y, y); + + sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + } + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); + } +} + +static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width, + int height, int buf_stride, int eps, + int bit_depth) { + int i, j; + + // Horizontal sum + int width_extend = (width + 3) & ~3; + for (i = 0; i < height; ++i) { + int h = AOMMIN(3, height - i) + AOMMIN(2, i); + + __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]); + __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]); + __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]); + __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]); + + __m128i sum_ = _mm_add_epi32( + _mm_add_epi32( + _mm_add_epi32(_mm_slli_si128(b1, 8), _mm_slli_si128(b1, 4)), + _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))), + _mm_alignr_epi8(b2, b1, 8)); + __m128i sum_sq_ = _mm_add_epi32( + _mm_add_epi32( + 
_mm_add_epi32(_mm_slli_si128(a1, 8), _mm_slli_si128(a1, 4)), + _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))), + _mm_alignr_epi8(a2, a1, 8)); + + __m128i n = _mm_set_epi32(5 * h, 5 * h, 4 * h, 3 * h); + __m128i one_over_n = + _mm_set_epi32(one_by_x[5 * h - 1], one_by_x[5 * h - 1], + one_by_x[4 * h - 1], one_by_x[3 * h - 1]); + __m128i s = _mm_set_epi32( + sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1], + sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]); + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A, + B); + + // Re-align a1 and b1 so that they start at index i * buf_stride + 2 + a2 = _mm_alignr_epi8(a2, a1, 8); + b2 = _mm_alignr_epi8(b2, b1, 8); + + n = _mm_set1_epi32(5 * h); + one_over_n = _mm_set1_epi32(one_by_x[5 * h - 1]); + s = _mm_set1_epi32(sgrproj_mtable[eps - 1][5 * h - 1]); + + for (j = 4; j < width_extend - 4; j += 4) { + a1 = a2; + a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]); + b1 = b2; + b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]); + /* Loop invariant: At this point, + a1 = original A[i * buf_stride + j - 2 : i * buf_stride + j + 2] + a2 = original A[i * buf_stride + j + 2 : i * buf_stride + j + 6] + and similar for b1,b2 and B + */ + sum_ = _mm_add_epi32( + _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), + _mm_alignr_epi8(b2, b1, 8))), + _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2)); + sum_sq_ = _mm_add_epi32( + _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), + _mm_alignr_epi8(a2, a1, 8))), + _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2)); + + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + A, B); + } + // If the width is not a multiple of 4, we need to reset j to width - 4 + // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained + __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]); + __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]); + + j = width - 4; + switch (width % 4) { + case 0: + a1 = a2; + b1 = b2; + a2 = a3; + b2 = b3; + break; + case 1: + a1 = _mm_alignr_epi8(a2, a1, 4); + b1 = _mm_alignr_epi8(b2, b1, 4); + a2 = _mm_alignr_epi8(a3, a2, 4); + b2 = _mm_alignr_epi8(b3, b2, 4); + break; + case 2: + a1 = _mm_alignr_epi8(a2, a1, 8); + b1 = _mm_alignr_epi8(b2, b1, 8); + a2 = _mm_alignr_epi8(a3, a2, 8); + b2 = _mm_alignr_epi8(b3, b2, 8); + break; + case 3: + a1 = _mm_alignr_epi8(a2, a1, 12); + b1 = _mm_alignr_epi8(b2, b1, 12); + a2 = _mm_alignr_epi8(a3, a2, 12); + b2 = _mm_alignr_epi8(b3, b2, 12); + break; + } + + // Zero out the data loaded from "off the edge" of the array + __m128i zero = _mm_setzero_si128(); + a2 = _mm_blend_epi16(a2, zero, 0xf0); + b2 = _mm_blend_epi16(b2, zero, 0xf0); + + sum_ = _mm_add_epi32( + _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), + _mm_alignr_epi8(b2, b1, 8))), + _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2)); + sum_sq_ = _mm_add_epi32( + _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), + _mm_alignr_epi8(a2, a1, 8))), + _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2)); + + n = _mm_set_epi32(3 * h, 4 * h, 5 * h, 5 * h); + one_over_n = _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[4 * h - 1], + one_by_x[5 * h - 1], one_by_x[5 * h - 1]); + s = _mm_set_epi32( + sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1], + sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1]); + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + A, B); + } +} + +static 
void selfguided_restoration_3_v(uint8_t *src, int width, int height, + int src_stride, int32_t *A, int32_t *B, + int buf_stride) { + int i, j; + + // Vertical sum over 7-pixel regions, 4 columns at a time + int width_extend = (width + 3) & ~3; + for (j = 0; j < width_extend; j += 4) { + __m128i a, b, c, d, x, y, x2, y2; + __m128i sum, sum_sq, tmp, tmp2; + + a = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[j])); + b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[src_stride + j])); + c = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + d = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); + + sum = _mm_cvtepi16_epi32( + _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d))); + tmp = _mm_unpacklo_epi16(a, b); + tmp2 = _mm_unpacklo_epi16(c, d); + sum_sq = + _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2)); + + _mm_store_si128((__m128i *)&B[j], sum); + _mm_store_si128((__m128i *)&A[j], sum_sq); + + x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[5 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&src[6 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + for (i = 3; i < height - 4; ++i) { + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)&src[(i - 3) * src_stride + j]))); + y = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)&src[(i + 4) * src_stride + j]))); + + sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + + x2 = _mm_mullo_epi32(x, x); + y2 = _mm_mullo_epi32(y, y); + + sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + } + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); + + x = _mm_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq); + } +} + +static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width, + int height, int buf_stride, int eps, + int bit_depth) { + int i, j; + // Horizontal sum over 
7-pixel regions of dst + int width_extend = (width + 3) & ~3; + for (i = 0; i < height; ++i) { + int h = AOMMIN(4, height - i) + AOMMIN(3, i); + + __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]); + __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]); + __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]); + __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]); + + __m128i sum_ = _mm_add_epi32( + _mm_add_epi32( + _mm_add_epi32(_mm_slli_si128(b1, 12), _mm_slli_si128(b1, 8)), + _mm_add_epi32(_mm_slli_si128(b1, 4), b1)), + _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b2, b1, 4), + _mm_alignr_epi8(b2, b1, 8)), + _mm_alignr_epi8(b2, b1, 12))); + __m128i sum_sq_ = _mm_add_epi32( + _mm_add_epi32( + _mm_add_epi32(_mm_slli_si128(a1, 12), _mm_slli_si128(a1, 8)), + _mm_add_epi32(_mm_slli_si128(a1, 4), a1)), + _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), + _mm_alignr_epi8(a2, a1, 8)), + _mm_alignr_epi8(a2, a1, 12))); + + __m128i n = _mm_set_epi32(7 * h, 6 * h, 5 * h, 4 * h); + __m128i one_over_n = + _mm_set_epi32(one_by_x[7 * h - 1], one_by_x[6 * h - 1], + one_by_x[5 * h - 1], one_by_x[4 * h - 1]); + __m128i s = _mm_set_epi32( + sgrproj_mtable[eps - 1][7 * h - 1], sgrproj_mtable[eps - 1][6 * h - 1], + sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1]); + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A, + B); + + // Re-align a1 and b1 so that they start at index i * buf_stride + 1 + a2 = _mm_alignr_epi8(a2, a1, 4); + b2 = _mm_alignr_epi8(b2, b1, 4); + + n = _mm_set1_epi32(7 * h); + one_over_n = _mm_set1_epi32(one_by_x[7 * h - 1]); + s = _mm_set1_epi32(sgrproj_mtable[eps - 1][7 * h - 1]); + + for (j = 4; j < width_extend - 4; j += 4) { + a1 = a2; + a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]); + b1 = b2; + b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]); + __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]); + __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]); + /* Loop invariant: At this point, + a1 = original A[i * buf_stride + j - 3 : i * buf_stride + j + 1] + a2 = original A[i * buf_stride + j + 1 : i * buf_stride + j + 5] + a3 = original A[i * buf_stride + j + 5 : i * buf_stride + j + 9] + and similar for b1,b2,b3 and B + */ + sum_ = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)), + _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8), + _mm_alignr_epi8(b2, b1, 12))), + _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(b3, b2, 4)), + _mm_alignr_epi8(b3, b2, 8))); + sum_sq_ = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)), + _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8), + _mm_alignr_epi8(a2, a1, 12))), + _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(a3, a2, 4)), + _mm_alignr_epi8(a3, a2, 8))); + + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + A, B); + } + __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]); + __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]); + + j = width - 4; + switch (width % 4) { + case 0: + a1 = a2; + b1 = b2; + a2 = a3; + b2 = b3; + break; + case 1: + a1 = _mm_alignr_epi8(a2, a1, 4); + b1 = _mm_alignr_epi8(b2, b1, 4); + a2 = _mm_alignr_epi8(a3, a2, 4); + b2 = _mm_alignr_epi8(b3, b2, 4); + break; + case 2: + a1 = _mm_alignr_epi8(a2, a1, 8); + b1 = _mm_alignr_epi8(b2, b1, 8); + a2 = _mm_alignr_epi8(a3, a2, 8); + b2 = _mm_alignr_epi8(b3, b2, 8); + break; + case 3: + a1 = _mm_alignr_epi8(a2, a1, 12); + b1 = 
_mm_alignr_epi8(b2, b1, 12); + a2 = _mm_alignr_epi8(a3, a2, 12); + b2 = _mm_alignr_epi8(b3, b2, 12); + break; + } + + // Zero out the data loaded from "off the edge" of the array + __m128i zero = _mm_setzero_si128(); + a2 = _mm_blend_epi16(a2, zero, 0xc0); + b2 = _mm_blend_epi16(b2, zero, 0xc0); + + sum_ = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)), + _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8), + _mm_alignr_epi8(b2, b1, 12))), + _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(zero, b2, 4)), + _mm_alignr_epi8(zero, b2, 8))); + sum_sq_ = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)), + _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8), + _mm_alignr_epi8(a2, a1, 12))), + _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(zero, a2, 4)), + _mm_alignr_epi8(zero, a2, 8))); + + n = _mm_set_epi32(4 * h, 5 * h, 6 * h, 7 * h); + one_over_n = _mm_set_epi32(one_by_x[4 * h - 1], one_by_x[5 * h - 1], + one_by_x[6 * h - 1], one_by_x[7 * h - 1]); + s = _mm_set_epi32( + sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1], + sgrproj_mtable[eps - 1][6 * h - 1], sgrproj_mtable[eps - 1][7 * h - 1]); + calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + A, B); + } +} + +void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, + int stride, int32_t *dst, int dst_stride, + int r, int eps, int32_t *tmpbuf) { + int32_t *A = tmpbuf; + int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int i, j; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes for efficiency. + int buf_stride = ((width + 3) & ~3) + 16; + + // Don't filter tiles with dimensions < 5 on any axis + if ((width < 5) || (height < 5)) return; + + if (r == 1) { + selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride); + selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8); + } else if (r == 2) { + selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride); + selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8); + } else if (r == 3) { + selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride); + selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8); + } else { + assert(0); + } + + { + i = 0; + j = 0; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + + A[k + buf_stride + 1]; + const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + + B[k + buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + + A[k + buf_stride - 1] + A[k + buf_stride + 1]; + const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] + + B[k + buf_stride - 1] + B[k + buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + j = width - 1; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + 
buf_stride] + + A[k + buf_stride - 1]; + const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + + B[k + buf_stride - 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + for (i = 1; i < height - 1; ++i) { + j = 0; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + + A[k + 1] + A[k - buf_stride + 1] + + A[k + buf_stride + 1]; + const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + + B[k + 1] + B[k - buf_stride + 1] + + B[k + buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + + // Vectorize the innermost loop + for (j = 1; j < width - 1; j += 4) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + + __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]); + __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]); + __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]); + __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]); + __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]); + __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]); + + __m128i a0 = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2), + _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8), + _mm_alignr_epi8(tmp5, tmp4, 4))), + _mm_alignr_epi8(tmp1, tmp0, 4)); + __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4), + _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8), + _mm_alignr_epi8(tmp5, tmp4, 8))); + __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1); + + __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]); + __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]); + __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]); + __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]); + __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]); + __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]); + + __m128i b0 = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8), + _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8), + _mm_alignr_epi8(tmp11, tmp10, 4))), + _mm_alignr_epi8(tmp7, tmp6, 4)); + __m128i b1 = + _mm_add_epi32(_mm_add_epi32(tmp6, tmp10), + _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8), + _mm_alignr_epi8(tmp11, tmp10, 8))); + __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1); + + __m128i src = _mm_cvtepu8_epi32(_mm_loadu_si128((__m128i *)&dgd[l])); + + __m128i rounding = _mm_set1_epi32( + (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1); + __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + _mm_storeu_si128((__m128i *)&dst[m], w); + } + + // Deal with any extra pixels at the right-hand edge of the frame + // (typically have 2 such pixels, but may have anywhere between 0 and 3) + for (; j < width - 1; ++j) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = + (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * + 4 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 
3; + const int32_t b = + (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * + 4 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 3; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + + j = width - 1; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + + A[k - 1] + A[k - buf_stride - 1] + + A[k + buf_stride - 1]; + const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + + B[k - 1] + B[k - buf_stride - 1] + + B[k + buf_stride - 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + + { + i = height - 1; + j = 0; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + + A[k - buf_stride + 1]; + const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + + B[k - buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + + A[k - buf_stride - 1] + A[k - buf_stride + 1]; + const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] + + B[k - buf_stride - 1] + B[k - buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + j = width - 1; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + + A[k - buf_stride - 1]; + const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + + B[k - buf_stride - 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } +} + +void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride, + int32_t *dst, int dst_stride, int corner, + int edge) { + int i, j; + const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge); + + { + i = 0; + j = 0; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) + + corner * + (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = center * dgd[k] + + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) + + corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + + dgd[k - 1] + dgd[k + 1]); + } + j = width - 1; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) + + corner * + (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]); + } + } + { + i = height - 1; + j = 0; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) + + corner * + (dgd[k - stride + 
1] + dgd[k + 1] + dgd[k - stride] + dgd[k]); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = center * dgd[k] + + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) + + corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + + dgd[k - 1] + dgd[k + 1]); + } + j = width - 1; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) + + corner * + (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]); + } + } + __m128i center_ = _mm_set1_epi16(center); + __m128i edge_ = _mm_set1_epi16(edge); + __m128i corner_ = _mm_set1_epi16(corner); + for (i = 1; i < height - 1; ++i) { + j = 0; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + + edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) + + corner * (dgd[k + stride + 1] + dgd[k - stride + 1] + + dgd[k - stride] + dgd[k + stride]); + } + // Process in units of 8 pixels at a time. + for (j = 1; j < width - 8; j += 8) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + + __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]); + __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]); + __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]); + + __m128i tl = _mm_cvtepu8_epi16(a); + __m128i tr = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)); + __m128i cl = _mm_cvtepu8_epi16(b); + __m128i cr = _mm_cvtepu8_epi16(_mm_srli_si128(b, 8)); + __m128i bl = _mm_cvtepu8_epi16(c); + __m128i br = _mm_cvtepu8_epi16(_mm_srli_si128(c, 8)); + + __m128i x = _mm_alignr_epi8(cr, cl, 2); + __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_alignr_epi8(tr, tl, 2), cl), + _mm_add_epi16(_mm_alignr_epi8(br, bl, 2), + _mm_alignr_epi8(cr, cl, 4))); + __m128i z = _mm_add_epi16(_mm_add_epi16(tl, bl), + _mm_add_epi16(_mm_alignr_epi8(tr, tl, 4), + _mm_alignr_epi8(br, bl, 4))); + + __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_), + _mm_add_epi16(_mm_mullo_epi16(y, edge_), + _mm_mullo_epi16(z, corner_))); + + _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res)); + _mm_storeu_si128((__m128i *)&dst[l + 4], + _mm_cvtepi16_epi32(_mm_srli_si128(res, 8))); + } + // If there are enough pixels left in this row, do another batch of 4 + // pixels. 
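+    // (Each batch computes outputs j .. j + 3 from source columns
+    // j - 1 .. j + 4, so it runs only while j + 3 is still an interior
+    // column; the scalar loop below finishes any leftover interior pixels.)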
+ for (; j < width - 4; j += 4) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + + __m128i a = _mm_loadl_epi64((__m128i *)&dgd[k - stride - 1]); + __m128i b = _mm_loadl_epi64((__m128i *)&dgd[k - 1]); + __m128i c = _mm_loadl_epi64((__m128i *)&dgd[k + stride - 1]); + + __m128i tl = _mm_cvtepu8_epi16(a); + __m128i cl = _mm_cvtepu8_epi16(b); + __m128i bl = _mm_cvtepu8_epi16(c); + + __m128i x = _mm_srli_si128(cl, 2); + __m128i y = _mm_add_epi16( + _mm_add_epi16(_mm_srli_si128(tl, 2), cl), + _mm_add_epi16(_mm_srli_si128(bl, 2), _mm_srli_si128(cl, 4))); + __m128i z = _mm_add_epi16( + _mm_add_epi16(tl, bl), + _mm_add_epi16(_mm_srli_si128(tl, 4), _mm_srli_si128(bl, 4))); + + __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_), + _mm_add_epi16(_mm_mullo_epi16(y, edge_), + _mm_mullo_epi16(z, corner_))); + + _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res)); + } + // Handle any leftover pixels + for (; j < width - 1; ++j) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + + edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) + + corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + + dgd[k - stride + 1] + dgd[k + stride + 1]); + } + j = width - 1; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + + edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) + + corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + + dgd[k - stride] + dgd[k + stride]); + } + } +} + +void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height, + int stride, int eps, int *xqd, + uint8_t *dst, int dst_stride, + int32_t *tmpbuf) { + int xq[2]; + int32_t *flt1 = tmpbuf; + int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; + int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; + int i, j; + assert(width * height <= RESTORATION_TILEPELS_MAX); +#if USE_HIGHPASS_IN_SGRPROJ + av1_highpass_filter_sse4_1(dat, width, height, stride, flt1, width, + sgr_params[eps].corner, sgr_params[eps].edge); +#else + av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width, + sgr_params[eps].r1, sgr_params[eps].e1, + tmpbuf2); +#endif // USE_HIGHPASS_IN_SGRPROJ + av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width, + sgr_params[eps].r2, sgr_params[eps].e2, + tmpbuf2); + decode_xq(xqd, xq); + + __m128i xq0 = _mm_set1_epi32(xq[0]); + __m128i xq1 = _mm_set1_epi32(xq[1]); + for (i = 0; i < height; ++i) { + // Calculate output in batches of 8 pixels + for (j = 0; j < width; j += 8) { + const int k = i * width + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + __m128i src = + _mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&dat[l])), + SGRPROJ_RST_BITS); + + const __m128i u_0 = _mm_cvtepu16_epi32(src); + const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8)); + + const __m128i f1_0 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0); + const __m128i f2_0 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0); + const __m128i f1_1 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1); + const __m128i f2_1 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1); + + const __m128i v_0 = _mm_add_epi32( + _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)), + _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS)); + const __m128i v_1 = _mm_add_epi32( + _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)), + _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS)); + 
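+      // v_0/v_1 above implement the projection
+      // v = xq[0] * (flt1 - u) + xq[1] * (flt2 - u) + (u << SGRPROJ_PRJ_BITS),
+      // blending the two filtered planes back toward the source pixel with
+      // the decoded weights (same formula as the scalar tail loop below).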
+ const __m128i rounding = + _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1); + const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + const __m128i tmp = _mm_packs_epi32(w_0, w_1); + const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); + _mm_storel_epi64((__m128i *)&dst[m], res); + } + // Process leftover pixels + for (; j < width; ++j) { + const int k = i * width + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS); + const int32_t f1 = (int32_t)flt1[k] - u; + const int32_t f2 = (int32_t)flt2[k] - u; + const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); + const int16_t w = + (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + dst[m] = (uint16_t)clip_pixel(w); + } + } +} + +#if CONFIG_HIGHBITDEPTH +// Only the vertical sums need to be adjusted for highbitdepth + +static void highbd_selfguided_restoration_1_v(uint16_t *src, int width, + int height, int src_stride, + int32_t *A, int32_t *B, + int buf_stride) { + int i, j; + + int width_extend = (width + 3) & ~3; + for (j = 0; j < width_extend; j += 4) { + __m128i a, b, x, y, x2, y2; + __m128i sum, sum_sq, tmp; + + a = _mm_loadl_epi64((__m128i *)&src[j]); + b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]); + + sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b)); + tmp = _mm_unpacklo_epi16(a, b); + sum_sq = _mm_madd_epi16(tmp, tmp); + + _mm_store_si128((__m128i *)&B[j], sum); + _mm_store_si128((__m128i *)&A[j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + for (i = 1; i < height - 2; ++i) { + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + y = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j])); + + sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + + x2 = _mm_mullo_epi32(x, x); + y2 = _mm_mullo_epi32(y, y); + + sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + } + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); + } +} + +static void highbd_selfguided_restoration_2_v(uint16_t *src, int width, + int height, int src_stride, + int32_t *A, int32_t *B, + int buf_stride) { + int i, j; + + int width_extend = (width + 3) & ~3; + for (j = 0; j < width_extend; j += 4) { + __m128i a, b, c, c2, x, y, x2, y2; + __m128i sum, sum_sq, tmp; + + a = _mm_loadl_epi64((__m128i *)&src[j]); + b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]); + c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]); + + sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c)); + // Important: We need to widen *before* squaring here, since + // c^2 may be up to 2^24. 
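+    // (With 12-bit input, c can be as large as 4095, so c * c ~= 2^24 does
+    // not fit in a 16-bit lane. The a/b pair can still go through
+    // _mm_madd_epi16 because each 16x16 product pair is accumulated
+    // straight into a 32-bit lane.)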
+ c = _mm_cvtepu16_epi32(c); + c2 = _mm_mullo_epi32(c, c); + tmp = _mm_unpacklo_epi16(a, b); + sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2); + + _mm_store_si128((__m128i *)&B[j], sum); + _mm_store_si128((__m128i *)&A[j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + for (i = 2; i < height - 3; ++i) { + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + y = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j])); + + sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + + x2 = _mm_mullo_epi32(x, x); + y2 = _mm_mullo_epi32(y, y); + + sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + } + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); + } +} + +static void highbd_selfguided_restoration_3_v(uint16_t *src, int width, + int height, int src_stride, + int32_t *A, int32_t *B, + int buf_stride) { + int i, j; + + int width_extend = (width + 3) & ~3; + for (j = 0; j < width_extend; j += 4) { + __m128i a, b, c, d, x, y, x2, y2; + __m128i sum, sum_sq, tmp, tmp2; + + a = _mm_loadl_epi64((__m128i *)&src[j]); + b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]); + c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]); + d = _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]); + + sum = _mm_cvtepi16_epi32( + _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d))); + tmp = _mm_unpacklo_epi16(a, b); + tmp2 = _mm_unpacklo_epi16(c, d); + sum_sq = + _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2)); + + _mm_store_si128((__m128i *)&B[j], sum); + _mm_store_si128((__m128i *)&A[j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[5 * src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[6 * 
src_stride + j])); + sum = _mm_add_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_add_epi32(sum_sq, x2); + + for (i = 3; i < height - 4; ++i) { + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j])); + y = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j])); + + sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x)); + + x2 = _mm_mullo_epi32(x, x); + y2 = _mm_mullo_epi32(y, y); + + sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2)); + } + _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq); + + x = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j])); + sum = _mm_sub_epi32(sum, x); + x2 = _mm_mullo_epi32(x, x); + sum_sq = _mm_sub_epi32(sum_sq, x2); + + _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum); + _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq); + } +} + +void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, + int height, int stride, + int32_t *dst, int dst_stride, + int bit_depth, int r, int eps, + int32_t *tmpbuf) { + int32_t *A = tmpbuf; + int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int i, j; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes for efficiency. 
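+  // (For example, width == 64 gives buf_stride == 80 int32_t entries, i.e.
+  // 320 bytes per row: still 16-byte aligned, but not a power of two, which
+  // helps keep successive rows of A and B out of the same cache sets.)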
+ int buf_stride = ((width + 3) & ~3) + 16; + + // Don't filter tiles with dimensions < 5 on any axis + if ((width < 5) || (height < 5)) return; + + if (r == 1) { + highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B, + buf_stride); + selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth); + } else if (r == 2) { + highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B, + buf_stride); + selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth); + } else if (r == 3) { + highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B, + buf_stride); + selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth); + } else { + assert(0); + } + + { + i = 0; + j = 0; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + + A[k + buf_stride + 1]; + const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + + B[k + buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] + + A[k + buf_stride - 1] + A[k + buf_stride + 1]; + const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] + + B[k + buf_stride - 1] + B[k + buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + j = width - 1; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + + A[k + buf_stride - 1]; + const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + + B[k + buf_stride - 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + for (i = 1; i < height - 1; ++i) { + j = 0; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + + A[k + 1] + A[k - buf_stride + 1] + + A[k + buf_stride + 1]; + const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + + B[k + 1] + B[k - buf_stride + 1] + + B[k + buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + + // Vectorize the innermost loop + for (j = 1; j < width - 1; j += 4) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + + __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]); + __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]); + __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]); + __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]); + __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]); + __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]); + + __m128i a0 = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2), + _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8), + _mm_alignr_epi8(tmp5, tmp4, 4))), + _mm_alignr_epi8(tmp1, tmp0, 4)); + __m128i a1 = 
_mm_add_epi32(_mm_add_epi32(tmp0, tmp4), + _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8), + _mm_alignr_epi8(tmp5, tmp4, 8))); + __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1); + + __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]); + __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]); + __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]); + __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]); + __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]); + __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]); + + __m128i b0 = _mm_add_epi32( + _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8), + _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8), + _mm_alignr_epi8(tmp11, tmp10, 4))), + _mm_alignr_epi8(tmp7, tmp6, 4)); + __m128i b1 = + _mm_add_epi32(_mm_add_epi32(tmp6, tmp10), + _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8), + _mm_alignr_epi8(tmp11, tmp10, 8))); + __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1); + + __m128i src = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)&dgd[l])); + + __m128i rounding = _mm_set1_epi32( + (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1); + __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + _mm_storeu_si128((__m128i *)&dst[m], w); + } + + // Deal with any extra pixels at the right-hand edge of the frame + // (typically have 2 such pixels, but may have anywhere between 0 and 3) + for (; j < width - 1; ++j) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = + (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * + 4 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 3; + const int32_t b = + (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * + 4 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 3; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + + j = width - 1; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) + + A[k - 1] + A[k - buf_stride - 1] + + A[k + buf_stride - 1]; + const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) + + B[k - 1] + B[k - buf_stride - 1] + + B[k + buf_stride - 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + + { + i = height - 1; + j = 0; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + + A[k - buf_stride + 1]; + const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + + B[k - buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] + + A[k - buf_stride - 1] + A[k - buf_stride + 1]; + const int32_t b = B[k] 
+ 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] + + B[k - buf_stride - 1] + B[k - buf_stride + 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + j = width - 1; + { + const int k = i * buf_stride + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + const int nb = 3; + const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + + A[k - buf_stride - 1]; + const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + + B[k - buf_stride - 1]; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } +} + +void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height, + int stride, int32_t *dst, int dst_stride, + int corner, int edge) { + int i, j; + const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge); + + { + i = 0; + j = 0; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) + + corner * + (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = center * dgd[k] + + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) + + corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + + dgd[k - 1] + dgd[k + 1]); + } + j = width - 1; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) + + corner * + (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]); + } + } + __m128i center_ = _mm_set1_epi32(center); + __m128i edge_ = _mm_set1_epi32(edge); + __m128i corner_ = _mm_set1_epi32(corner); + for (i = 1; i < height - 1; ++i) { + j = 0; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + + edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) + + corner * (dgd[k + stride + 1] + dgd[k - stride + 1] + + dgd[k - stride] + dgd[k + stride]); + } + // Process 4 pixels at a time + for (j = 1; j < width - 4; j += 4) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + + __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]); + __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]); + __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]); + + __m128i tl = _mm_cvtepu16_epi32(a); + __m128i tr = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8)); + __m128i cl = _mm_cvtepu16_epi32(b); + __m128i cr = _mm_cvtepu16_epi32(_mm_srli_si128(b, 8)); + __m128i bl = _mm_cvtepu16_epi32(c); + __m128i br = _mm_cvtepu16_epi32(_mm_srli_si128(c, 8)); + + __m128i x = _mm_alignr_epi8(cr, cl, 4); + __m128i y = _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tr, tl, 4), cl), + _mm_add_epi32(_mm_alignr_epi8(br, bl, 4), + _mm_alignr_epi8(cr, cl, 8))); + __m128i z = _mm_add_epi32(_mm_add_epi32(tl, bl), + _mm_add_epi32(_mm_alignr_epi8(tr, tl, 8), + _mm_alignr_epi8(br, bl, 8))); + + __m128i res = _mm_add_epi32(_mm_mullo_epi32(x, center_), + _mm_add_epi32(_mm_mullo_epi32(y, edge_), + _mm_mullo_epi32(z, corner_))); + + _mm_storeu_si128((__m128i *)&dst[l], res); + } + // Handle any leftover pixels + for (; j < width - 1; ++j) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + + edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) + + corner * (dgd[k + stride - 1] + dgd[k - stride 
- 1] + + dgd[k - stride + 1] + dgd[k + stride + 1]); + } + j = width - 1; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + + edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) + + corner * (dgd[k + stride - 1] + dgd[k - stride - 1] + + dgd[k - stride] + dgd[k + stride]); + } + } + { + i = height - 1; + j = 0; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) + + corner * + (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]); + } + for (j = 1; j < width - 1; ++j) { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = center * dgd[k] + + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) + + corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + + dgd[k - 1] + dgd[k + 1]); + } + j = width - 1; + { + const int k = i * stride + j; + const int l = i * dst_stride + j; + dst[l] = + center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) + + corner * + (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]); + } + } +} + +void apply_selfguided_restoration_highbd_sse4_1( + uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, + int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) { + int xq[2]; + int32_t *flt1 = tmpbuf; + int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; + int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; + int i, j; + assert(width * height <= RESTORATION_TILEPELS_MAX); +#if USE_HIGHPASS_IN_SGRPROJ + av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width, + sgr_params[eps].corner, + sgr_params[eps].edge); +#else + av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1, + width, bit_depth, sgr_params[eps].r1, + sgr_params[eps].e1, tmpbuf2); +#endif // USE_HIGHPASS_IN_SGRPROJ + av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2, + width, bit_depth, sgr_params[eps].r2, + sgr_params[eps].e2, tmpbuf2); + decode_xq(xqd, xq); + + __m128i xq0 = _mm_set1_epi32(xq[0]); + __m128i xq1 = _mm_set1_epi32(xq[1]); + for (i = 0; i < height; ++i) { + // Calculate output in batches of 8 pixels + for (j = 0; j < width; j += 8) { + const int k = i * width + j; + const int l = i * stride + j; + const int m = i * dst_stride + j; + __m128i src = + _mm_slli_epi16(_mm_load_si128((__m128i *)&dat[l]), SGRPROJ_RST_BITS); + + const __m128i u_0 = _mm_cvtepu16_epi32(src); + const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8)); + + const __m128i f1_0 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0); + const __m128i f2_0 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0); + const __m128i f1_1 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1); + const __m128i f2_1 = + _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1); + + const __m128i v_0 = _mm_add_epi32( + _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)), + _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS)); + const __m128i v_1 = _mm_add_epi32( + _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)), + _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS)); + + const __m128i rounding = + _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1); + const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + // Pack into 16 
bits and clamp to [0, 2^bit_depth)
+      const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+      const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+      const __m128i res = _mm_min_epi16(tmp, max);
+
+      _mm_store_si128((__m128i *)&dst[m], res);
+    }
+    // Process leftover pixels
+    for (; j < width; ++j) {
+      const int k = i * width + j;
+      const int l = i * stride + j;
+      const int m = i * dst_stride + j;
+      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
+      const int32_t f1 = (int32_t)flt1[k] - u;
+      const int32_t f2 = (int32_t)flt2[k] - u;
+      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+      const int16_t w =
+          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
+    }
+  }
+}
+
+#endif
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 000000000..925e4650d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+static const __m128i *const filter = (const __m128i *const)warped_filter;
+
+/* SSE2 version of the rotzoom/affine warp filter */
+void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
+                          int stride, uint8_t *pred, int p_col, int p_row,
+                          int p_width, int p_height, int p_stride,
+                          int subsampling_x, int subsampling_y, int ref_frm,
+                          int16_t alpha, int16_t beta, int16_t gamma,
+                          int16_t delta) {
+  __m128i tmp[15];
+  int i, j, k;
+
+  /* Note: For this code to work, the left/right frame borders need to be
+     extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but we allow an explicit check
+     for debugging purposes.
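+
+     The figure of 13 comes from the horizontal filter below: it loads 16
+     source bytes starting at ix4 - 7, and the full filter only runs for
+     ix4 in the range (-7, width + 6), so loads can reach up to 13 pixels
+     past either edge of the row.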
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + // (x, y) coordinates of the center of this block in the destination + // image + int32_t dst_x = p_col + j + 4; + int32_t dst_y = p_row + i + 4; + + int32_t x4, y4, ix4, sx4, iy4, sy4; + if (subsampling_x) + x4 = ROUND_POWER_OF_TWO_SIGNED( + mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2, + 1); + else + x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; + + if (subsampling_y) + y4 = ROUND_POWER_OF_TWO_SIGNED( + mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2, + 1); + else + y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; + + ix4 = x4 >> WARPEDMODEL_PREC_BITS; + sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + iy4 = y4 >> WARPEDMODEL_PREC_BITS; + sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Horizontal filter + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. + if (ix4 <= -7) { + tmp[k + 7] = _mm_set1_epi16( + ref[iy * stride] * + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } else if (ix4 >= width + 6) { + tmp[k + 7] = _mm_set1_epi16( + ref[iy * stride + (width - 1)] * + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } else { + int sx = sx4 + alpha * (-4) + beta * k + + // Include rounding and offset here + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + // Load source pixels + __m128i zero = _mm_setzero_si128(); + __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + // Filter even-index pixels + __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + + // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 + __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 + __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 + __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 + __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 + __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 + __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 + __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 + __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + __m128i round_const = + _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1); + + // Calculate filtered results + __m128i src_0 = _mm_unpacklo_epi8(src, zero); + __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + 
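+          // Each _mm_madd_epi16 pairs two adjacent taps: lane p of res_0
+          // holds taps 0-1 of the 8-tap filter for even output pixel 2p,
+          // and res_2 / res_4 / res_6 below contribute taps 2-3, 4-5, 6-7.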
__m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero); + __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero); + __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero); + __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + HORSHEAR_REDUCE_PREC_BITS); + + // Filter odd-index pixels + __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero); + __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero); + __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero); + __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero); + __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + HORSHEAR_REDUCE_PREC_BITS); + + // Combine results into one register. + // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 + // as this order helps with the vertical filter. 
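+          // (With that layout the vertical filter can split each pair of
+          // rows into even and odd columns using a single unpacklo/unpackhi,
+          // with no extra shuffling.)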
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); + } + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + gamma * (-4) + delta * k + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + __m128i *src = tmp + (k + 4); + __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + // Round and pack into 8 bits + __m128i round_const = + _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1); + + __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); + __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + if (ref_frm) { + const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); + res_8bit = _mm_avg_epu8(res_8bit, orig); + } + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); + _mm_storel_epi64(p, res_8bit); + } + } + } + } +} -- cgit v1.2.3