67 files changed, 4413 insertions, 1659 deletions
diff --git a/third_party/aom/test/ans_test.cc b/third_party/aom/test/ans_test.cc
index a553a9e84..fd460f409 100644
--- a/third_party/aom/test/ans_test.cc
+++ b/third_party/aom/test/ans_test.cc
@@ -51,7 +51,8 @@ PvVec abs_encode_build_vals(int iters) {
 
 bool check_rabs(const PvVec &pv_vec, uint8_t *buf) {
   BufAnsCoder a;
-  aom_buf_ans_alloc(&a, NULL, kBufAnsSize);
+  a.size = kBufAnsSize;
+  aom_buf_ans_alloc(&a, NULL);
   buf_ans_write_init(&a, buf);
 
   std::clock_t start = std::clock();
@@ -125,7 +126,8 @@ void rans_build_dec_tab(const struct rans_sym sym_tab[],
 bool check_rans(const std::vector<int> &sym_vec, const rans_sym *const tab,
                 uint8_t *buf) {
   BufAnsCoder a;
-  aom_buf_ans_alloc(&a, NULL, kBufAnsSize);
+  a.size = kBufAnsSize;
+  aom_buf_ans_alloc(&a, NULL);
   buf_ans_write_init(&a, buf);
   aom_cdf_prob dec_tab[kRansSymbols];
   rans_build_dec_tab(tab, dec_tab);
diff --git a/third_party/aom/test/aq_segment_test.cc b/third_party/aom/test/aq_segment_test.cc
index 026b0022b..57db0d0ff 100644
--- a/third_party/aom/test/aq_segment_test.cc
+++ b/third_party/aom/test/aq_segment_test.cc
@@ -90,7 +90,7 @@ TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ2) { DoTest(2); }
 
 TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ3) { DoTest(3); }
 
-#if CONFIG_DELTA_Q & !CONFIG_EXT_DELTA_Q
+#if !CONFIG_EXT_DELTA_Q
 // Validate that this AQ mode (AQ=4, delta q)
 // encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatchAQ4) {
diff --git a/third_party/aom/test/av1_convolve_2d_test.cc b/third_party/aom/test/av1_convolve_2d_test.cc
index b066dd4f8..002ede403 100644
--- a/third_party/aom/test/av1_convolve_2d_test.cc
+++ b/third_party/aom/test/av1_convolve_2d_test.cc
@@ -22,14 +22,14 @@ using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DTest;
 
 namespace {
 
-TEST_P(AV1Convolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
+TEST_P(AV1Convolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(2)); }
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, AV1Convolve2DTest,
     libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sse2));
 
 #if CONFIG_HIGHBITDEPTH && HAVE_SSSE3
-TEST_P(AV1HighbdConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(4)); }
+TEST_P(AV1HighbdConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
 
 INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.cc b/third_party/aom/test/av1_convolve_2d_test_util.cc
index 8cec216af..3b61f6bb7 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.cc
+++ b/third_party/aom/test/av1_convolve_2d_test_util.cc
@@ -23,9 +23,9 @@ namespace AV1Convolve2D {
 ::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
     convolve_2d_func filter) {
   const Convolve2DParam params[] = {
-    make_tuple(4, 4, 20, filter),  make_tuple(8, 8, 10, filter),
-    make_tuple(64, 64, 1, filter), make_tuple(4, 16, 10, filter),
-    make_tuple(32, 8, 5, filter),
+    make_tuple(4, 4, filter),   make_tuple(8, 8, filter),
+    make_tuple(64, 64, filter), make_tuple(4, 16, filter),
+    make_tuple(32, 8, filter),
   };
   return ::testing::ValuesIn(params);
 }
@@ -38,7 +38,6 @@ void AV1Convolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
 void AV1Convolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
   const int w = 128, h = 128;
   const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
-  const int num_iters = GET_PARAM(2);
   int i, j, k;
 
   uint8_t *input = new uint8_t[h * w];
@@ -50,9 +49,6 @@ void AV1Convolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
   for (i = 0; i < h; ++i)
     for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
 
-  memset(output, 0, output_n * sizeof(CONV_BUF_TYPE));
-  memset(output2, 0, output_n * sizeof(CONV_BUF_TYPE));
-
   int hfilter, vfilter, subx, suby;
   for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
     for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
@@ -60,13 +56,20 @@ void AV1Convolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
           av1_get_interp_filter_params((InterpFilter)hfilter);
       InterpFilterParams filter_params_y =
           av1_get_interp_filter_params((InterpFilter)vfilter);
+      const int do_average = rnd_.Rand8() & 1;
       ConvolveParams conv_params1 =
-          get_conv_params_no_round(0, 0, 0, output, MAX_SB_SIZE);
+          get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE);
       ConvolveParams conv_params2 =
-          get_conv_params_no_round(0, 0, 0, output2, MAX_SB_SIZE);
+          get_conv_params_no_round(0, do_average, 0, output2, MAX_SB_SIZE);
 
       for (subx = 0; subx < 16; ++subx)
         for (suby = 0; suby < 16; ++suby) {
+          // av1_convolve_2d is designed for accumulate two predicted blocks for
+          // compound mode, so we set num_iter to two here.
+          // A larger number may introduce overflow
+          const int num_iters = 2;
+          memset(output, 0, output_n * sizeof(*output));
+          memset(output2, 0, output_n * sizeof(*output2));
           for (i = 0; i < num_iters; ++i) {
             // Choose random locations within the source block
             int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
@@ -102,14 +105,14 @@ namespace AV1HighbdConvolve2D {
 ::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
     highbd_convolve_2d_func filter) {
   const HighbdConvolve2DParam params[] = {
-    make_tuple(4, 4, 20, 8, filter),   make_tuple(8, 8, 10, 8, filter),
-    make_tuple(64, 64, 1, 8, filter),  make_tuple(4, 16, 10, 8, filter),
-    make_tuple(32, 8, 10, 8, filter),  make_tuple(4, 4, 20, 10, filter),
-    make_tuple(8, 8, 10, 10, filter),  make_tuple(64, 64, 1, 10, filter),
-    make_tuple(4, 16, 10, 10, filter), make_tuple(32, 8, 10, 10, filter),
-    make_tuple(4, 4, 20, 12, filter),  make_tuple(8, 8, 10, 12, filter),
-    make_tuple(64, 64, 1, 12, filter), make_tuple(4, 16, 10, 12, filter),
-    make_tuple(32, 8, 10, 12, filter),
+    make_tuple(4, 4, 8, filter),    make_tuple(8, 8, 8, filter),
+    make_tuple(64, 64, 8, filter),  make_tuple(4, 16, 8, filter),
+    make_tuple(32, 8, 8, filter),   make_tuple(4, 4, 10, filter),
+    make_tuple(8, 8, 10, filter),   make_tuple(64, 64, 10, filter),
+    make_tuple(4, 16, 10, filter),  make_tuple(32, 8, 10, filter),
+    make_tuple(4, 4, 12, filter),   make_tuple(8, 8, 12, filter),
+    make_tuple(64, 64, 12, filter), make_tuple(4, 16, 12, filter),
+    make_tuple(32, 8, 12, filter),
   };
   return ::testing::ValuesIn(params);
 }
@@ -125,8 +128,7 @@ void AV1HighbdConvolve2DTest::RunCheckOutput(
     highbd_convolve_2d_func test_impl) {
   const int w = 128, h = 128;
   const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
-  const int num_iters = GET_PARAM(2);
-  const int bd = GET_PARAM(3);
+  const int bd = GET_PARAM(2);
   int i, j, k;
 
   uint16_t *input = new uint16_t[h * w];
@@ -138,9 +140,6 @@ void AV1HighbdConvolve2DTest::RunCheckOutput(
   for (i = 0; i < h; ++i)
     for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
 
-  memset(output, 0, output_n * sizeof(CONV_BUF_TYPE));
-  memset(output2, 0, output_n * sizeof(CONV_BUF_TYPE));
-
   int hfilter, vfilter, subx, suby;
   for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
     for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
@@ -155,6 +154,12 @@ void AV1HighbdConvolve2DTest::RunCheckOutput(
 
       for (subx = 0; subx < 16; ++subx)
         for (suby = 0; suby < 16; ++suby) {
+          // av1_convolve_2d is designed for accumulate two predicted blocks for
+          // compound mode, so we set num_iter to two here.
+          // A larger number may introduce overflow
+          const int num_iters = 2;
+          memset(output, 0, output_n * sizeof(*output));
+          memset(output2, 0, output_n * sizeof(*output2));
           for (i = 0; i < num_iters; ++i) {
             // Choose random locations within the source block
             int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.h b/third_party/aom/test/av1_convolve_2d_test_util.h
index ed0eeb450..013126b4a 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.h
+++ b/third_party/aom/test/av1_convolve_2d_test_util.h
@@ -31,7 +31,7 @@ typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params);
 
-typedef std::tr1::tuple<int, int, int, convolve_2d_func> Convolve2DParam;
+typedef std::tr1::tuple<int, int, convolve_2d_func> Convolve2DParam;
 
 ::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
     convolve_2d_func filter);
@@ -59,7 +59,7 @@ typedef void (*highbd_convolve_2d_func)(
     InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 
-typedef std::tr1::tuple<int, int, int, int, highbd_convolve_2d_func>
+typedef std::tr1::tuple<int, int, int, highbd_convolve_2d_func>
     HighbdConvolve2DParam;
 
 ::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
diff --git a/third_party/aom/test/av1_convolve_scale_test.cc b/third_party/aom/test/av1_convolve_scale_test.cc
new file mode 100644
index 000000000..9d8be888d
--- /dev/null
+++ b/third_party/aom/test/av1_convolve_scale_test.cc
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+const int kXStepQn = 16;
+const int kYStepQn = 20;
+
+using std::tr1::tuple;
+using std::tr1::make_tuple;
+using libaom_test::ACMRandom;
+
+enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP };
+int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; }
+
+// A 16-bit filter with a configurable number of taps.
+class TestFilter {
+ public:
+  void set(NTaps ntaps, bool backwards);
+
+  InterpFilterParams params_;
+
+ private:
+  std::vector<int16_t> coeffs_;
+};
+
+void TestFilter::set(NTaps ntaps, bool backwards) {
+  const int n = NTapsToInt(ntaps);
+  assert(n >= 8 && n <= 12);
+
+  // The filter has n * SUBPEL_SHIFTS proper elements and an extra 8 bogus
+  // elements at the end so that convolutions can read off the end safely.
+  coeffs_.resize(n * SUBPEL_SHIFTS + 8);
+
+  // The coefficients are pretty much arbitrary, but convolutions shouldn't
+  // over or underflow. For the first filter (subpels = 0), we use an
+  // increasing or decreasing ramp (depending on the backwards parameter). We
+  // don't want any zero coefficients, so we make it have an x-intercept at -1
+  // or n. To ensure absence of under/overflow, we normalise the area under the
+  // ramp to be I = 1 << FILTER_BITS (so that convolving a constant function
+  // gives the identity).
+  //
+  // When increasing, the function has the form:
+  //
+  //   f(x) = A * (x + 1)
+  //
+  // Summing and rearranging for A gives A = 2 * I / (n * (n + 1)). If the
+  // filter is reversed, we have the same A but with formula
+  //
+  //   g(x) = A * (n - x)
+  const int I = 1 << FILTER_BITS;
+  const float A = 2.f * I / (n * (n + 1.f));
+  for (int i = 0; i < n; ++i) {
+    coeffs_[i] = static_cast<int16_t>(A * (backwards ? (n - i) : (i + 1)));
+  }
+
+  // For the other filters, make them slightly different by swapping two
+  // columns. Filter k will have the columns (k % n) and (7 * k) % n swapped.
+  const size_t filter_size = sizeof(coeffs_[0] * n);
+  int16_t *const filter0 = &coeffs_[0];
+  for (int k = 1; k < SUBPEL_SHIFTS; ++k) {
+    int16_t *filterk = &coeffs_[k * n];
+    memcpy(filterk, filter0, filter_size);
+
+    const int idx0 = k % n;
+    const int idx1 = (7 * k) % n;
+
+    const int16_t tmp = filterk[idx0];
+    filterk[idx0] = filterk[idx1];
+    filterk[idx1] = tmp;
+  }
+
+  // Finally, write some rubbish at the end to make sure we don't use it.
+  for (int i = 0; i < 8; ++i) coeffs_[n * SUBPEL_SHIFTS + i] = 123 + i;
+
+  // Fill in params
+  params_.filter_ptr = &coeffs_[0];
+  params_.taps = n;
+  // These are ignored by the functions being tested. Set them to whatever.
+  params_.subpel_shifts = SUBPEL_SHIFTS;
+  params_.interp_filter = EIGHTTAP_REGULAR;
+}
+
+template <typename SrcPixel>
+class TestImage {
+ public:
+  TestImage(int w, int h, int bd) : w_(w), h_(h), bd_(bd) {
+    assert(bd < 16);
+    assert(bd <= 8 * static_cast<int>(sizeof(SrcPixel)));
+
+    // Pad width by 2*kHPad and then round up to the next multiple of 16
+    // to get src_stride_. Add another 16 for dst_stride_ (to make sure
+    // something goes wrong if we use the wrong one)
+    src_stride_ = (w_ + 2 * kHPad + 15) & ~15;
+    dst_stride_ = src_stride_ + 16;
+
+    // Allocate image data
+    src_data_.resize(2 * src_block_size());
+    dst_data_.resize(2 * dst_block_size());
+  }
+
+  void Initialize(ACMRandom *rnd);
+  void Check() const;
+
+  int src_stride() const { return src_stride_; }
+  int dst_stride() const { return dst_stride_; }
+
+  int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+  int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+  const SrcPixel *GetSrcData(bool ref, bool borders) const {
+    const SrcPixel *block = &src_data_[ref ? 0 : src_block_size()];
+    return borders ? block : block + kHPad + src_stride_ * kVPad;
+  }
+
+  int32_t *GetDstData(bool ref, bool borders) {
+    int32_t *block = &dst_data_[ref ? 0 : dst_block_size()];
+    return borders ? block : block + kHPad + dst_stride_ * kVPad;
+  }
+
+ private:
+  int w_, h_, bd_;
+  int src_stride_, dst_stride_;
+
+  std::vector<SrcPixel> src_data_;
+  std::vector<int32_t> dst_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+  if (!trash) {
+    memset(data, 0, sizeof(*data) * num_pixels);
+    return;
+  }
+  const Pixel mask = (1 << bd) - 1;
+  for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+                 bool trash_edges, Pixel *data) {
+  assert(rnd);
+  const Pixel mask = (1 << bd) - 1;
+
+  // Fill in the first buffer with random data
+  // Top border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+  for (int r = 0; r < h; ++r) {
+    Pixel *row_data = data + (kVPad + r) * stride;
+    // Left border, contents, right border
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+    for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+  }
+  // Bottom border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+  const int bpp = sizeof(*data);
+  const int block_elts = stride * (h + 2 * kVPad);
+  const int block_size = bpp * block_elts;
+
+  // Now copy that to the second buffer
+  memcpy(data + block_elts, data, block_size);
+}
+
+template <typename SrcPixel>
+void TestImage<SrcPixel>::Initialize(ACMRandom *rnd) {
+  PrepBuffers(rnd, w_, h_, src_stride_, bd_, false, &src_data_[0]);
+  PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+}
+
+template <typename SrcPixel>
+void TestImage<SrcPixel>::Check() const {
+  // If memcmp returns 0, there's nothing to do.
+  const int num_pixels = dst_block_size();
+  const int32_t *ref_dst = &dst_data_[0];
+  const int32_t *tst_dst = &dst_data_[num_pixels];
+
+  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+  // Otherwise, iterate through the buffer looking for differences (including
+  // the edges)
+  const int stride = dst_stride_;
+  for (int r = 0; r < h_ + 2 * kVPad; ++r) {
+    for (int c = 0; c < w_ + 2 * kHPad; ++c) {
+      const int32_t ref_value = ref_dst[r * stride + c];
+      const int32_t tst_value = tst_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad);
+    }
+  }
+}
+
+typedef tuple<int, int> BlockDimension;
+
+struct BaseParams {
+  BaseParams(BlockDimension dims, NTaps ntaps_x, NTaps ntaps_y, bool avg)
+      : dims(dims), ntaps_x(ntaps_x), ntaps_y(ntaps_y), avg(avg) {}
+
+  BlockDimension dims;
+  NTaps ntaps_x, ntaps_y;
+  bool avg;
+};
+
+template <typename SrcPixel>
+class ConvolveScaleTestBase : public ::testing::Test {
+ public:
+  ConvolveScaleTestBase() : image_(NULL) {}
+  virtual ~ConvolveScaleTestBase() { delete image_; }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  // Implemented by subclasses (SetUp depends on the parameters passed
+  // in and RunOne depends on the function to be tested. These can't
+  // be templated for low/high bit depths because they have different
+  // numbers of parameters)
+  virtual void SetUp() = 0;
+  virtual void RunOne(bool ref) = 0;
+
+ protected:
+  void SetParams(const BaseParams &params, int bd) {
+    width_ = std::tr1::get<0>(params.dims);
+    height_ = std::tr1::get<1>(params.dims);
+    ntaps_x_ = params.ntaps_x;
+    ntaps_y_ = params.ntaps_y;
+    bd_ = bd;
+    avg_ = params.avg;
+
+    filter_x_.set(ntaps_x_, false);
+    filter_y_.set(ntaps_y_, true);
+    convolve_params_ = get_conv_params_no_round(0, avg_ != false, 0, NULL, 0);
+
+    delete image_;
+    image_ = new TestImage<SrcPixel>(width_, height_, bd_);
+  }
+
+  void Run() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int i = 0; i < kTestIters; ++i) {
+      Prep(&rnd);
+      RunOne(true);
+      RunOne(false);
+      image_->Check();
+    }
+  }
+
+  void SpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Prep(&rnd);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: CDEFSpeedTest, SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+  }
+
+  static int RandomSubpel(ACMRandom *rnd) {
+    const uint8_t subpel_mode = rnd->Rand8();
+    if ((subpel_mode & 7) == 0) {
+      return 0;
+    } else if ((subpel_mode & 7) == 1) {
+      return SCALE_SUBPEL_SHIFTS - 1;
+    } else {
+      return 1 + rnd->PseudoUniform(SCALE_SUBPEL_SHIFTS - 2);
+    }
+  }
+
+  void Prep(ACMRandom *rnd) {
+    assert(rnd);
+
+    // Choose subpel_x_ and subpel_y_. They should be less than
+    // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to "interesting"
+    // values: 0 and SCALE_SUBPEL_SHIFTS - 1
+    subpel_x_ = RandomSubpel(rnd);
+    subpel_y_ = RandomSubpel(rnd);
+
+    image_->Initialize(rnd);
+  }
+
+  int width_, height_, bd_;
+  NTaps ntaps_x_, ntaps_y_;
+  bool avg_;
+  int subpel_x_, subpel_y_;
+  TestFilter filter_x_, filter_y_;
+  TestImage<SrcPixel> *image_;
+  ConvolveParams convolve_params_;
+};
+
+typedef tuple<int, int> BlockDimension;
+
+typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride,
+                                  int32_t *dst, int dst_stride, int w, int h,
+                                  InterpFilterParams *filter_params_x,
+                                  InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int x_step_qn,
+                                  const int subpel_y_qn, const int y_step_qn,
+                                  ConvolveParams *conv_params);
+
+// Test parameter list:
+//  <tst_fun, dims, ntaps_x, ntaps_y, avg>
+typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool>
+    LowBDParams;
+
+class LowBDConvolveScaleTest
+    : public ConvolveScaleTestBase<uint8_t>,
+      public ::testing::WithParamInterface<LowBDParams> {
+ public:
+  virtual ~LowBDConvolveScaleTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+
+    const BlockDimension &block = GET_PARAM(1);
+    const NTaps ntaps_x = GET_PARAM(2);
+    const NTaps ntaps_y = GET_PARAM(3);
+    const int bd = 8;
+    const bool avg = GET_PARAM(4);
+
+    SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint8_t *src = image_->GetSrcData(ref, false);
+    CONV_BUF_TYPE *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+
+    if (ref) {
+      av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_,
+                              &filter_x_.params_, &filter_y_.params_, subpel_x_,
+                              kXStepQn, subpel_y_, kYStepQn, &convolve_params_);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
+               &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+               subpel_y_, kYStepQn, &convolve_params_);
+    }
+  }
+
+ private:
+  LowbdConvolveFunc tst_fun_;
+};
+
+const BlockDimension kBlockDim[] = {
+  make_tuple(2, 2),    make_tuple(2, 4),    make_tuple(4, 4),
+  make_tuple(4, 8),    make_tuple(8, 4),    make_tuple(8, 8),
+  make_tuple(8, 16),   make_tuple(16, 8),   make_tuple(16, 16),
+  make_tuple(16, 32),  make_tuple(32, 16),  make_tuple(32, 32),
+  make_tuple(32, 64),  make_tuple(64, 32),  make_tuple(64, 64),
+  make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128),
+};
+
+const NTaps kNTaps[] = { EIGHT_TAP, TEN_TAP, TWELVE_TAP };
+
+TEST_P(LowBDConvolveScaleTest, Check) { Run(); }
+TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, LowBDConvolveScaleTest,
+    ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1),
+                       ::testing::ValuesIn(kBlockDim),
+                       ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+                       ::testing::Bool()));
+
+#if CONFIG_HIGHBITDEPTH
+typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
+                                   int32_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int x_step_qn,
+                                   const int subpel_y_qn, const int y_step_qn,
+                                   ConvolveParams *conv_params, int bd);
+
+// Test parameter list:
+//  <tst_fun, dims, ntaps_x, ntaps_y, avg, bd>
+typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int>
+    HighBDParams;
+
+class HighBDConvolveScaleTest
+    : public ConvolveScaleTestBase<uint16_t>,
+      public ::testing::WithParamInterface<HighBDParams> {
+ public:
+  virtual ~HighBDConvolveScaleTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+
+    const BlockDimension &block = GET_PARAM(1);
+    const NTaps ntaps_x = GET_PARAM(2);
+    const NTaps ntaps_y = GET_PARAM(3);
+    const bool avg = GET_PARAM(4);
+    const int bd = GET_PARAM(5);
+
+    SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint16_t *src = image_->GetSrcData(ref, false);
+    CONV_BUF_TYPE *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+
+    if (ref) {
+      av1_highbd_convolve_2d_scale_c(
+          src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_,
+          &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn,
+          &convolve_params_, bd_);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
+               &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+               subpel_y_, kYStepQn, &convolve_params_, bd_);
+    }
+  }
+
+ private:
+  HighbdConvolveFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveScaleTest, Check) { Run(); }
+TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, HighBDConvolveScaleTest,
+    ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1),
+                       ::testing::ValuesIn(kBlockDim),
+                       ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+                       ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+
+#endif  // CONFIG_HIGHBITDEPTH
+}  // namespace
diff --git a/third_party/aom/test/av1_convolve_test.cc b/third_party/aom/test/av1_convolve_test.cc
index 3947c7166..aaef7cfe0 100644
--- a/third_party/aom/test/av1_convolve_test.cc
+++ b/third_party/aom/test/av1_convolve_test.cc
@@ -269,16 +269,9 @@ INSTANTIATE_TEST_CASE_P(
 #ifndef __clang_analyzer__
 TEST(AV1ConvolveTest, av1_highbd_convolve) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-#if CONFIG_DUAL_FILTER
-  InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
-                                    EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+  InterpFilters interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
   InterpFilterParams filter_params =
-      av1_get_interp_filter_params(interp_filter[0]);
-#else
-  InterpFilter interp_filter = EIGHTTAP_REGULAR;
-  InterpFilterParams filter_params =
-      av1_get_interp_filter_params(interp_filter);
-#endif
+      av1_get_interp_filter_params(EIGHTTAP_REGULAR);
   int filter_size = filter_params.taps;
   int filter_center = filter_size / 2 - 1;
   uint16_t src[12 * 12];
@@ -303,7 +296,7 @@ TEST(AV1ConvolveTest, av1_highbd_convolve) {
     for (subpel_y_q4 = 0; subpel_y_q4 < SUBPEL_SHIFTS; subpel_y_q4++) {
       av1_highbd_convolve(
           CONVERT_TO_BYTEPTR(src + src_stride * filter_center + filter_center),
-          src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, interp_filter,
+          src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, interp_filters,
           subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
 
       const int16_t *x_filter =
@@ -331,16 +324,9 @@ TEST(AV1ConvolveTest, av1_highbd_convolve) {
 
 TEST(AV1ConvolveTest, av1_highbd_convolve_avg) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-#if CONFIG_DUAL_FILTER
-  InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
-                                    EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+  InterpFilters interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
   InterpFilterParams filter_params =
-      av1_get_interp_filter_params(interp_filter[0]);
-#else
-  InterpFilter interp_filter = EIGHTTAP_REGULAR;
-  InterpFilterParams filter_params =
-      av1_get_interp_filter_params(interp_filter);
-#endif
+      av1_get_interp_filter_params(EIGHTTAP_REGULAR);
   int filter_size = filter_params.taps;
   int filter_center = filter_size / 2 - 1;
   uint16_t src0[12 * 12];
@@ -373,23 +359,23 @@ TEST(AV1ConvolveTest, av1_highbd_convolve_avg) {
       avg = 0;
       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
                           CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
-                          interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
                           y_step_q4, avg, bd);
       avg = 0;
       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
                           CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
-                          interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
                           y_step_q4, avg, bd);
 
       avg = 0;
       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
-                          interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
                           y_step_q4, avg, bd);
       avg = 1;
       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
-                          interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
                           y_step_q4, avg, bd);
 
       EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
diff --git a/third_party/aom/test/av1_dct_test.cc b/third_party/aom/test/av1_dct_test.cc
index 8ce7a79d4..fdaf9abb9 100644
--- a/third_party/aom/test/av1_dct_test.cc
+++ b/third_party/aom/test/av1_dct_test.cc
@@ -23,7 +23,8 @@
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 1
 #define AV1_DCT_GTEST
 #include "av1/encoder/dct.c"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
+    CONFIG_DAALA_DCT32
 #include "av1/common/daala_tx.c"
 #endif
 
diff --git a/third_party/aom/test/av1_fht16x16_test.cc b/third_party/aom/test/av1_fht16x16_test.cc
index c0f6974c6..21235a837 100644
--- a/third_party/aom/test/av1_fht16x16_test.cc
+++ b/third_party/aom/test/av1_fht16x16_test.cc
@@ -28,7 +28,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x16Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht16x16Param;
 
 void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                   TxfmParam *txfm_param) {
@@ -42,15 +42,15 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
 
 #if CONFIG_HIGHBITDEPTH
 typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                           int tx_type, int bd);
+                           TX_TYPE tx_type, int bd);
 typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
-                          int tx_type, int bd);
+                          TX_TYPE tx_type, int bd);
 
 // Target optimized function, tx_type, bit depth
-typedef tuple<HbdHtFunc, int, int> HighbdHt16x16Param;
+typedef tuple<HbdHtFunc, TX_TYPE, int> HighbdHt16x16Param;
 
 void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride,
-                         int tx_type, int bd) {
+                         TX_TYPE tx_type, int bd) {
   av1_fwd_txfm2d_16x16_c(in, out, stride, tx_type, bd);
 }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -128,7 +128,7 @@ class AV1HighbdTrans16x16HT
  private:
   HbdHtFunc fwd_txfm_;
   HbdHtFunc fwd_txfm_ref_;
-  int tx_type_;
+  TX_TYPE tx_type_;
   int bit_depth_;
   int mask_;
   int num_coeffs_;
@@ -164,113 +164,113 @@ TEST_P(AV1HighbdTrans16x16HT, HighbdCoeffCheck) { RunBitexactCheck(); }
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_DAALA_DCT16
 const Ht16x16Param kArrayHt16x16Param_sse2[] = {
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 0, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 1, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 2, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 3, AOM_BITS_8,
-             256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_DCT,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, ADST_DCT,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_ADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, ADST_ADST,
+             AOM_BITS_8, 256),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 4, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 5, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 6, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 7, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 8, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 9, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 10, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, IDTX, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 11, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, V_DCT, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 12, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, H_DCT, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 13, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, V_ADST, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 14, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, H_ADST, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 15, AOM_BITS_8,
-             256)
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, V_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, H_FLIPADST,
+             AOM_BITS_8, 256)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x16HT,
                         ::testing::ValuesIn(kArrayHt16x16Param_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_AVX2
+#if HAVE_AVX2 && !CONFIG_DAALA_DCT16
 const Ht16x16Param kArrayHt16x16Param_avx2[] = {
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 0, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 1, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 2, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 3, AOM_BITS_8,
-             256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_DCT,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, ADST_DCT,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_ADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, ADST_ADST,
+             AOM_BITS_8, 256),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 4, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 5, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 6, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 7, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 8, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 9, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, FLIPADST_DCT,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, ADST_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, FLIPADST_ADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, IDTX, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, V_DCT, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 12, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, H_DCT, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 13, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, V_ADST, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 14, AOM_BITS_8,
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, H_ADST, AOM_BITS_8,
              256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 15, AOM_BITS_8,
-             256)
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, V_FLIPADST,
+             AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, H_FLIPADST,
+             AOM_BITS_8, 256)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT,
                         ::testing::ValuesIn(kArrayHt16x16Param_avx2));
 #endif  // HAVE_AVX2
 
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT16
 const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 0, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 0, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 1, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 1, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 2, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 2, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 3, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 3, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_ADST, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_ADST, 12),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 4, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 4, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 5, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 5, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 6, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 6, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 7, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 7, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 8, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 8, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_ADST, 12),
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans16x16HT,
                         ::testing::ValuesIn(kArrayHBDHt16x16Param_sse4_1));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT16
 
 }  // namespace
diff --git a/third_party/aom/test/av1_fht16x32_test.cc b/third_party/aom/test/av1_fht16x32_test.cc
index 099a312e3..0b3928f64 100644
--- a/third_party/aom/test/av1_fht16x32_test.cc
+++ b/third_party/aom/test/av1_fht16x32_test.cc
@@ -28,7 +28,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x32Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht16x32Param;
 
 void fht16x32_ref(const int16_t *in, tran_low_t *out, int stride,
                   TxfmParam *txfm_param) {
@@ -80,23 +80,34 @@ TEST_P(AV1Trans16x32HT, InvAccuracyCheck) { RunInvAccuracyCheck(4); }
 
 using std::tr1::make_tuple;
 const Ht16x32Param kArrayHt16x32Param_c[] = {
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 0, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 1, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 2, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 3, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, DCT_DCT, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, ADST_DCT, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, DCT_ADST, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, ADST_ADST, AOM_BITS_8,
+             512),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 4, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 5, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 6, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 7, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 8, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 9, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 10, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 11, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 12, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 13, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 14, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 15, AOM_BITS_8, 512)
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, FLIPADST_DCT, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, DCT_FLIPADST, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, FLIPADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, ADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, FLIPADST_ADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, IDTX, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, V_DCT, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, H_DCT, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, V_ADST, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, H_ADST, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, V_FLIPADST, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, H_FLIPADST, AOM_BITS_8,
+             512)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(C, AV1Trans16x32HT,
@@ -104,39 +115,39 @@ INSTANTIATE_TEST_CASE_P(C, AV1Trans16x32HT,
 
 #if HAVE_SSE2
 const Ht16x32Param kArrayHt16x32Param_sse2[] = {
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 0, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 1, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 2, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 3, AOM_BITS_8,
-             512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, DCT_DCT,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, ADST_DCT,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, DCT_ADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, ADST_ADST,
+             AOM_BITS_8, 512),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 4, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 5, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 6, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 7, AOM_BITS_8,
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, IDTX, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 8, AOM_BITS_8,
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, V_DCT, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 9, AOM_BITS_8,
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, H_DCT, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 10, AOM_BITS_8,
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, V_ADST, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 11, AOM_BITS_8,
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, H_ADST, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 12, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 13, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 14, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 15, AOM_BITS_8,
-             512)
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, V_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, H_FLIPADST,
+             AOM_BITS_8, 512)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x32HT,
diff --git a/third_party/aom/test/av1_fht16x8_test.cc b/third_party/aom/test/av1_fht16x8_test.cc
index 8277e2865..3ee1a0830 100644
--- a/third_party/aom/test/av1_fht16x8_test.cc
+++ b/third_party/aom/test/av1_fht16x8_test.cc
@@ -28,7 +28,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x8Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht16x8Param;
 
 void fht16x8_ref(const int16_t *in, tran_low_t *out, int stride,
                  TxfmParam *txfm_param) {
@@ -81,23 +81,31 @@ TEST_P(AV1Trans16x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
 using std::tr1::make_tuple;
 
 const Ht16x8Param kArrayHt16x8Param_c[] = {
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 0, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 1, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 2, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 3, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, DCT_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, ADST_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, DCT_ADST, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, ADST_ADST, AOM_BITS_8,
+             128),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 4, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 5, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 6, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 7, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 8, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 9, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 10, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 11, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 12, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 13, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 14, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, 15, AOM_BITS_8, 128)
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, FLIPADST_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, DCT_FLIPADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, FLIPADST_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, ADST_FLIPADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, FLIPADST_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, IDTX, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, V_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, H_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, V_ADST, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, H_ADST, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, V_FLIPADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, H_FLIPADST, AOM_BITS_8,
+             128)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(C, AV1Trans16x8HT,
@@ -105,23 +113,39 @@ INSTANTIATE_TEST_CASE_P(C, AV1Trans16x8HT,
 
 #if HAVE_SSE2
 const Ht16x8Param kArrayHt16x8Param_sse2[] = {
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 0, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 1, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 2, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 3, AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, DCT_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, ADST_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, DCT_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, ADST_ADST,
+             AOM_BITS_8, 128),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 4, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 5, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 6, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 7, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 8, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 9, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 10, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 11, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 12, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 13, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 14, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 15, AOM_BITS_8, 128)
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, IDTX, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, V_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, H_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, V_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, H_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, V_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, H_FLIPADST,
+             AOM_BITS_8, 128)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x8HT,
diff --git a/third_party/aom/test/av1_fht32x16_test.cc b/third_party/aom/test/av1_fht32x16_test.cc
index 1c70fd4fc..cbce074e5 100644
--- a/third_party/aom/test/av1_fht32x16_test.cc
+++ b/third_party/aom/test/av1_fht32x16_test.cc
@@ -28,7 +28,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht32x16Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht32x16Param;
 
 void fht32x16_ref(const int16_t *in, tran_low_t *out, int stride,
                   TxfmParam *txfm_param) {
@@ -80,23 +80,34 @@ TEST_P(AV1Trans32x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(4); }
 
 using std::tr1::make_tuple;
 const Ht32x16Param kArrayHt32x16Param_c[] = {
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 0, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 1, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 2, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 3, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, DCT_DCT, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, ADST_DCT, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, DCT_ADST, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, ADST_ADST, AOM_BITS_8,
+             512),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 4, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 5, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 6, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 7, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 8, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 9, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 10, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 11, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 12, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 13, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 14, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 15, AOM_BITS_8, 512)
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, FLIPADST_DCT, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, DCT_FLIPADST, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, FLIPADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, ADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, FLIPADST_ADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, IDTX, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, V_DCT, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, H_DCT, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, V_ADST, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, H_ADST, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, V_FLIPADST, AOM_BITS_8,
+             512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, H_FLIPADST, AOM_BITS_8,
+             512)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(C, AV1Trans32x16HT,
@@ -104,39 +115,39 @@ INSTANTIATE_TEST_CASE_P(C, AV1Trans32x16HT,
 
 #if HAVE_SSE2
 const Ht32x16Param kArrayHt32x16Param_sse2[] = {
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 0, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 1, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 2, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 3, AOM_BITS_8,
-             512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, DCT_DCT,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, ADST_DCT,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, DCT_ADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, ADST_ADST,
+             AOM_BITS_8, 512),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 4, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 5, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 6, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 7, AOM_BITS_8,
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, IDTX, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 8, AOM_BITS_8,
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, V_DCT, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 9, AOM_BITS_8,
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, H_DCT, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 10, AOM_BITS_8,
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, V_ADST, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 11, AOM_BITS_8,
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, H_ADST, AOM_BITS_8,
              512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 12, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 13, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 14, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 15, AOM_BITS_8,
-             512)
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, V_FLIPADST,
+             AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, H_FLIPADST,
+             AOM_BITS_8, 512)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x16HT,
diff --git a/third_party/aom/test/av1_fht32x32_test.cc b/third_party/aom/test/av1_fht32x32_test.cc
index e96ffffc2..613bc9183 100644
--- a/third_party/aom/test/av1_fht32x32_test.cc
+++ b/third_party/aom/test/av1_fht32x32_test.cc
@@ -28,7 +28,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht32x32Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht32x32Param;
 
 void fht32x32_ref(const int16_t *in, tran_low_t *out, int stride,
                   TxfmParam *txfm_param) {
@@ -37,20 +37,20 @@ void fht32x32_ref(const int16_t *in, tran_low_t *out, int stride,
 
 #if CONFIG_HIGHBITDEPTH
 typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                           int tx_type, int bd);
+                           TX_TYPE tx_type, int bd);
 typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
-                          int tx_type, int bd);
+                          TX_TYPE tx_type, int bd);
 
 // Target optimized function, tx_type, bit depth
-typedef tuple<HbdHtFunc, int, int> HighbdHt32x32Param;
+typedef tuple<HbdHtFunc, TX_TYPE, int> HighbdHt32x32Param;
 
 void highbd_fht32x32_ref(const int16_t *in, int32_t *out, int stride,
-                         int tx_type, int bd) {
+                         TX_TYPE tx_type, int bd) {
   av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);
 }
 #endif  // CONFIG_HIGHBITDEPTH
 
-#if HAVE_SSE2 || HAVE_AVX2
+#if (HAVE_SSE2 || HAVE_AVX2) && !CONFIG_DAALA_DCT32
 void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
                     const TxfmParam *txfm_param) {
   (void)in;
@@ -129,7 +129,7 @@ class AV1HighbdTrans32x32HT
  private:
   HbdHtFunc fwd_txfm_;
   HbdHtFunc fwd_txfm_ref_;
-  int tx_type_;
+  TX_TYPE tx_type_;
   int bit_depth_;
   int mask_;
   int num_coeffs_;
@@ -165,53 +165,63 @@ TEST_P(AV1HighbdTrans32x32HT, HighbdCoeffCheck) { RunBitexactCheck(); }
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_DAALA_DCT32
 const Ht32x32Param kArrayHt32x32Param_sse2[] = {
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 0, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 1, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 2, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 3, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_ADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_ADST, AOM_BITS_8, 1024),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 4, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 5, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 6, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 7, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 8, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 9, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 10, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 11, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 12, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 13, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 14, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, 15, AOM_BITS_8, 1024)
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, FLIPADST_DCT, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_FLIPADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, FLIPADST_FLIPADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_FLIPADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, FLIPADST_ADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, IDTX, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, V_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, H_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, V_ADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, H_ADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, V_FLIPADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, H_FLIPADST, AOM_BITS_8, 1024)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x32HT,
                         ::testing::ValuesIn(kArrayHt32x32Param_sse2));
-#endif  // HAVE_SSE2
+#endif  // HAVE_SSE2 && !CONFIG_DAALA_DCT32
 
-#if HAVE_AVX2
+#if HAVE_AVX2 && !CONFIG_DAALA_DCT32
 const Ht32x32Param kArrayHt32x32Param_avx2[] = {
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 0, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 1, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 2, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 3, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_ADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_ADST, AOM_BITS_8, 1024),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 4, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 5, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 6, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 7, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 8, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 9, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 10, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 11, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 12, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 13, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 14, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 15, AOM_BITS_8, 1024)
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, FLIPADST_DCT, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_FLIPADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, FLIPADST_FLIPADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_FLIPADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, FLIPADST_ADST, AOM_BITS_8,
+             1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, IDTX, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, V_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, H_DCT, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, V_ADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, H_ADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, V_FLIPADST, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, H_FLIPADST, AOM_BITS_8, 1024)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
                         ::testing::ValuesIn(kArrayHt32x32Param_avx2));
-#endif  // HAVE_AVX2
+#endif  // HAVE_AVX2 && !CONFIG_DAALA_DCT32
 }  // namespace
diff --git a/third_party/aom/test/av1_fht4x4_test.cc b/third_party/aom/test/av1_fht4x4_test.cc
index f49d7368d..1d4fc1352 100644
--- a/third_party/aom/test/av1_fht4x4_test.cc
+++ b/third_party/aom/test/av1_fht4x4_test.cc
@@ -28,7 +28,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht4x4Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht4x4Param;
 
 void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
                 TxfmParam *txfm_param) {
@@ -42,16 +42,16 @@ void iht4x4_ref(const tran_low_t *in, uint8_t *out, int stride,
 
 #if CONFIG_HIGHBITDEPTH
 typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                              int tx_type, int bd);
+                              TX_TYPE tx_type, int bd);
 typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
-                           int tx_type, int bd);
+                           TX_TYPE tx_type, int bd);
 
 // HighbdHt4x4Param argument list:
 // <Target optimized function, tx_type, bit depth>
-typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
+typedef tuple<HBDFhtFunc, TX_TYPE, int> HighbdHt4x4Param;
 
-void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride, int tx_type,
-                       int bd) {
+void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
+                       TX_TYPE tx_type, int bd) {
   av1_fwd_txfm2d_4x4_c(in, out, stride, tx_type, bd);
 }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -131,7 +131,7 @@ class AV1HighbdTrans4x4HT : public ::testing::TestWithParam<HighbdHt4x4Param> {
  private:
   HBDFhtFunc fwd_txfm_;
   HBDFhtFunc fwd_txfm_ref_;
-  int tx_type_;
+  TX_TYPE tx_type_;
   int bit_depth_;
   int mask_;
   int num_coeffs_;
@@ -167,58 +167,69 @@ TEST_P(AV1HighbdTrans4x4HT, HighbdCoeffCheck) { RunBitexactCheck(); }
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_DAALA_DCT4
 const Ht4x4Param kArrayHt4x4Param_sse2[] = {
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 0, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 1, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 2, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 3, AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_DCT, AOM_BITS_8,
+             16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, ADST_DCT, AOM_BITS_8,
+             16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_ADST, AOM_BITS_8,
+             16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, ADST_ADST, AOM_BITS_8,
+             16),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 4, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 5, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 6, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 7, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 8, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 9, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 10, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 11, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 12, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 13, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 14, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 15, AOM_BITS_8, 16)
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, IDTX, AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, V_DCT, AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, H_DCT, AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, V_ADST, AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, H_ADST, AOM_BITS_8, 16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, V_FLIPADST, AOM_BITS_8,
+             16),
+  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, H_FLIPADST, AOM_BITS_8,
+             16)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans4x4HT,
                         ::testing::ValuesIn(kArrayHt4x4Param_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
 const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 0, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 0, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 1, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 1, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 2, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 2, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 3, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 3, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_ADST, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_ADST, 12),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 4, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 4, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 5, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 5, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 6, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 6, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 7, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 7, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 8, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 8, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_ADST, 12),
 #endif  // CONFIG_EXT_TX
 };
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans4x4HT,
                         ::testing::ValuesIn(kArrayHighbdHt4x4Param));
 
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
 
 }  // namespace
diff --git a/third_party/aom/test/av1_fht4x8_test.cc b/third_party/aom/test/av1_fht4x8_test.cc
index e447d8e2e..f9d2120e0 100644
--- a/third_party/aom/test/av1_fht4x8_test.cc
+++ b/third_party/aom/test/av1_fht4x8_test.cc
@@ -28,7 +28,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht4x8Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht4x8Param;
 
 void fht4x8_ref(const int16_t *in, tran_low_t *out, int stride,
                 TxfmParam *txfm_param) {
@@ -81,23 +81,26 @@ TEST_P(AV1Trans4x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
 using std::tr1::make_tuple;
 
 const Ht4x8Param kArrayHt4x8Param_c[] = {
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 0, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 1, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 2, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 3, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, DCT_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, ADST_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, DCT_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, ADST_ADST, AOM_BITS_8, 32),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 4, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 5, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 6, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 7, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 8, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 9, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 10, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 11, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 12, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 13, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 14, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, 15, AOM_BITS_8, 32)
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, FLIPADST_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, DCT_FLIPADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, FLIPADST_FLIPADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, ADST_FLIPADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, FLIPADST_ADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, IDTX, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, V_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, H_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, V_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, H_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, V_FLIPADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, H_FLIPADST, AOM_BITS_8, 32)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(C, AV1Trans4x8HT,
@@ -105,23 +108,34 @@ INSTANTIATE_TEST_CASE_P(C, AV1Trans4x8HT,
 
 #if HAVE_SSE2
 const Ht4x8Param kArrayHt4x8Param_sse2[] = {
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 0, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 1, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 2, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 3, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, DCT_DCT, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, ADST_DCT, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, DCT_ADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, ADST_ADST, AOM_BITS_8,
+             32),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 4, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 5, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 6, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 7, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 8, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 9, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 10, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 11, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 12, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 13, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 14, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 15, AOM_BITS_8, 32)
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, IDTX, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, V_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, H_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, V_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, H_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, V_FLIPADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, H_FLIPADST, AOM_BITS_8,
+             32)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans4x8HT,
diff --git a/third_party/aom/test/av1_fht64x64_test.cc b/third_party/aom/test/av1_fht64x64_test.cc
index 61ea9f1f3..f2a03e7ee 100644
--- a/third_party/aom/test/av1_fht64x64_test.cc
+++ b/third_party/aom/test/av1_fht64x64_test.cc
@@ -29,7 +29,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht64x64Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht64x64Param;
 
 void fht64x64_ref(const int16_t *in, tran_low_t *out, int stride,
                   TxfmParam *txfm_param) {
@@ -82,23 +82,38 @@ TEST_P(AV1Trans64x64HT, InvAccuracyCheck) { RunInvAccuracyCheck(4); }
 using std::tr1::make_tuple;
 
 const Ht64x64Param kArrayHt64x64Param_c[] = {
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 0, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 1, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 2, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 3, AOM_BITS_8, 4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, DCT_DCT, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, ADST_DCT, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, DCT_ADST, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, ADST_ADST, AOM_BITS_8,
+             4096),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 4, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 5, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 6, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 7, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 8, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 9, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 10, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 11, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 12, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 13, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 14, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, 15, AOM_BITS_8, 4096)
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, FLIPADST_DCT,
+             AOM_BITS_8, 4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, DCT_FLIPADST,
+             AOM_BITS_8, 4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, FLIPADST_FLIPADST,
+             AOM_BITS_8, 4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, ADST_FLIPADST,
+             AOM_BITS_8, 4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, FLIPADST_ADST,
+             AOM_BITS_8, 4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, IDTX, AOM_BITS_8, 4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, V_DCT, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, H_DCT, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, V_ADST, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, H_ADST, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, V_FLIPADST, AOM_BITS_8,
+             4096),
+  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, H_FLIPADST, AOM_BITS_8,
+             4096)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(C, AV1Trans64x64HT,
diff --git a/third_party/aom/test/av1_fht8x16_test.cc b/third_party/aom/test/av1_fht8x16_test.cc
index 11f085885..689cb0b90 100644
--- a/third_party/aom/test/av1_fht8x16_test.cc
+++ b/third_party/aom/test/av1_fht8x16_test.cc
@@ -27,7 +27,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x16Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht8x16Param;
 
 void fht8x16_ref(const int16_t *in, tran_low_t *out, int stride,
                  TxfmParam *txfm_param) {
@@ -80,23 +80,31 @@ TEST_P(AV1Trans8x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
 using std::tr1::make_tuple;
 
 const Ht8x16Param kArrayHt8x16Param_c[] = {
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 0, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 1, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 2, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 3, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, DCT_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, ADST_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, DCT_ADST, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, ADST_ADST, AOM_BITS_8,
+             128),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 4, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 5, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 6, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 7, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 8, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 9, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 10, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 11, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 12, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 13, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 14, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, 15, AOM_BITS_8, 128)
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, FLIPADST_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, DCT_FLIPADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, FLIPADST_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, ADST_FLIPADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, FLIPADST_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, IDTX, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, V_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, H_DCT, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, V_ADST, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, H_ADST, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, V_FLIPADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, H_FLIPADST, AOM_BITS_8,
+             128)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(C, AV1Trans8x16HT,
@@ -104,23 +112,39 @@ INSTANTIATE_TEST_CASE_P(C, AV1Trans8x16HT,
 
 #if HAVE_SSE2
 const Ht8x16Param kArrayHt8x16Param_sse2[] = {
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 0, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 1, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 2, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 3, AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, DCT_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, ADST_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, DCT_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, ADST_ADST,
+             AOM_BITS_8, 128),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 4, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 5, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 6, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 7, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 8, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 9, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 10, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 11, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 12, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 13, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 14, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 15, AOM_BITS_8, 128)
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, IDTX, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, V_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, H_DCT, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, V_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, H_ADST, AOM_BITS_8,
+             128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, V_FLIPADST,
+             AOM_BITS_8, 128),
+  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, H_FLIPADST,
+             AOM_BITS_8, 128)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x16HT,
diff --git a/third_party/aom/test/av1_fht8x4_test.cc b/third_party/aom/test/av1_fht8x4_test.cc
index c797421af..e50a69457 100644
--- a/third_party/aom/test/av1_fht8x4_test.cc
+++ b/third_party/aom/test/av1_fht8x4_test.cc
@@ -27,7 +27,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using std::tr1::tuple;
 using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x4Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht8x4Param;
 
 void fht8x4_ref(const int16_t *in, tran_low_t *out, int stride,
                 TxfmParam *txfm_param) {
@@ -80,23 +80,26 @@ TEST_P(AV1Trans8x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
 using std::tr1::make_tuple;
 
 const Ht8x4Param kArrayHt8x4Param_c[] = {
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 0, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 1, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 2, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 3, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, DCT_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, ADST_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, DCT_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, ADST_ADST, AOM_BITS_8, 32),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 4, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 5, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 6, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 7, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 8, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 9, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 10, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 11, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 12, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 13, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 14, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, 15, AOM_BITS_8, 32)
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, FLIPADST_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, DCT_FLIPADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, FLIPADST_FLIPADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, ADST_FLIPADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, FLIPADST_ADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, IDTX, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, V_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, H_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, V_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, H_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, V_FLIPADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, H_FLIPADST, AOM_BITS_8, 32)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(C, AV1Trans8x4HT,
@@ -104,23 +107,34 @@ INSTANTIATE_TEST_CASE_P(C, AV1Trans8x4HT,
 
 #if HAVE_SSE2
 const Ht8x4Param kArrayHt8x4Param_sse2[] = {
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 0, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 1, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 2, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 3, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, DCT_DCT, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, ADST_DCT, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, DCT_ADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, ADST_ADST, AOM_BITS_8,
+             32),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 4, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 5, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 6, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 7, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 8, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 9, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 10, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 11, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 12, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 13, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 14, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 15, AOM_BITS_8, 32)
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, IDTX, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, V_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, H_DCT, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, V_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, H_ADST, AOM_BITS_8, 32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, V_FLIPADST, AOM_BITS_8,
+             32),
+  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, H_FLIPADST, AOM_BITS_8,
+             32)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x4HT,
diff --git a/third_party/aom/test/av1_fht8x8_test.cc b/third_party/aom/test/av1_fht8x8_test.cc
index 259557cfb..499fcc338 100644
--- a/third_party/aom/test/av1_fht8x8_test.cc
+++ b/third_party/aom/test/av1_fht8x8_test.cc
@@ -29,7 +29,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
 
 using libaom_test::FhtFunc;
 using std::tr1::tuple;
-typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x8Param;
+typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht8x8Param;
 
 void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride,
                 TxfmParam *txfm_param) {
@@ -43,14 +43,14 @@ void iht8x8_ref(const tran_low_t *in, uint8_t *out, int stride,
 
 #if CONFIG_HIGHBITDEPTH
 typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                           int tx_type, int bd);
+                           TX_TYPE tx_type, int bd);
 typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
-                          int tx_type, int bd);
+                          TX_TYPE tx_type, int bd);
 // Target optimized function, tx_type, bit depth
-typedef tuple<HbdHtFunc, int, int> HighbdHt8x8Param;
+typedef tuple<HbdHtFunc, TX_TYPE, int> HighbdHt8x8Param;
 
-void highbd_fht8x8_ref(const int16_t *in, int32_t *out, int stride, int tx_type,
-                       int bd) {
+void highbd_fht8x8_ref(const int16_t *in, int32_t *out, int stride,
+                       TX_TYPE tx_type, int bd) {
   av1_fwd_txfm2d_8x8_c(in, out, stride, tx_type, bd);
 }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -130,7 +130,7 @@ class AV1HighbdTrans8x8HT : public ::testing::TestWithParam<HighbdHt8x8Param> {
  private:
   HbdHtFunc fwd_txfm_;
   HbdHtFunc fwd_txfm_ref_;
-  int tx_type_;
+  TX_TYPE tx_type_;
   int bit_depth_;
   int mask_;
   int num_coeffs_;
@@ -167,56 +167,67 @@ TEST_P(AV1HighbdTrans8x8HT, HighbdCoeffCheck) { RunBitexactCheck(); }
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_DAALA_DCT8
 const Ht8x8Param kArrayHt8x8Param_sse2[] = {
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 0, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 1, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 2, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 3, AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_DCT, AOM_BITS_8,
+             64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, ADST_DCT, AOM_BITS_8,
+             64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_ADST, AOM_BITS_8,
+             64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, ADST_ADST, AOM_BITS_8,
+             64),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 4, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 5, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 6, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 7, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 8, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 9, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 10, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 11, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 12, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 13, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 14, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 15, AOM_BITS_8, 64)
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, FLIPADST_DCT,
+             AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_FLIPADST,
+             AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, FLIPADST_FLIPADST,
+             AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, ADST_FLIPADST,
+             AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, FLIPADST_ADST,
+             AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, IDTX, AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, V_DCT, AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, H_DCT, AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, V_ADST, AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, H_ADST, AOM_BITS_8, 64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, V_FLIPADST, AOM_BITS_8,
+             64),
+  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, H_FLIPADST, AOM_BITS_8,
+             64)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x8HT,
                         ::testing::ValuesIn(kArrayHt8x8Param_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT8
 const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = {
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 0, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 0, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 1, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 1, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 2, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 2, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 3, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 3, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_ADST, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_ADST, 12),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 4, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 4, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 5, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 5, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 6, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 6, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 7, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 7, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 8, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 8, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_DCT, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_DCT, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_FLIPADST, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_FLIPADST, 12),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_ADST, 10),
+  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_ADST, 12),
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans8x8HT,
                         ::testing::ValuesIn(kArrayHBDHt8x8Param_sse4_1));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT8
 
 }  // namespace
diff --git a/third_party/aom/test/av1_fwd_txfm1d_test.cc b/third_party/aom/test/av1_fwd_txfm1d_test.cc
index b10e84d2c..9deef3c95 100644
--- a/third_party/aom/test/av1_fwd_txfm1d_test.cc
+++ b/third_party/aom/test/av1_fwd_txfm1d_test.cc
@@ -51,11 +51,6 @@ TEST(av1_fwd_txfm1d, round_shift) {
   EXPECT_EQ(round_shift(-8, 2), -2);
 }
 
-TEST(av1_fwd_txfm1d, get_max_bit) {
-  int max_bit = get_max_bit(8);
-  EXPECT_EQ(max_bit, 3);
-}
-
 TEST(av1_fwd_txfm1d, cospi_arr_data) {
   for (int i = 0; i < 7; i++) {
     for (int j = 0; j < 64; j++) {
@@ -65,31 +60,6 @@ TEST(av1_fwd_txfm1d, cospi_arr_data) {
   }
 }
 
-TEST(av1_fwd_txfm1d, clamp_block) {
-  int16_t block[5][5] = { { 7, -5, 6, -3, 9 },
-                          { 7, -5, 6, -3, 9 },
-                          { 7, -5, 6, -3, 9 },
-                          { 7, -5, 6, -3, 9 },
-                          { 7, -5, 6, -3, 9 } };
-
-  int16_t ref_block[5][5] = { { 7, -5, 6, -3, 9 },
-                              { 7, -5, 6, -3, 9 },
-                              { 7, -4, 2, -3, 9 },
-                              { 7, -4, 2, -3, 9 },
-                              { 7, -4, 2, -3, 9 } };
-
-  int row = 2;
-  int col = 1;
-  int block_size = 3;
-  int stride = 5;
-  clamp_block(block[row] + col, block_size, block_size, stride, -4, 2);
-  for (int r = 0; r < stride; r++) {
-    for (int c = 0; c < stride; c++) {
-      EXPECT_EQ(block[r][c], ref_block[r][c]);
-    }
-  }
-}
-
 TEST(av1_fwd_txfm1d, accuracy) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   for (int si = 0; si < txfm_size_num; ++si) {
diff --git a/third_party/aom/test/av1_fwd_txfm2d_test.cc b/third_party/aom/test/av1_fwd_txfm2d_test.cc
index af3c8ff44..adf9a803c 100644
--- a/third_party/aom/test/av1_fwd_txfm2d_test.cc
+++ b/third_party/aom/test/av1_fwd_txfm2d_test.cc
@@ -177,5 +177,31 @@ const AV1FwdTxfm2dParam av1_fwd_txfm2d_param_c[] = {
 INSTANTIATE_TEST_CASE_P(C, AV1FwdTxfm2d,
                         ::testing::ValuesIn(av1_fwd_txfm2d_param_c));
 
+TEST(AV1FwdTxfm2d, CfgTest) {
+  for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+    int bd = libaom_test::bd_arr[bd_idx];
+    int8_t low_range = libaom_test::low_range_arr[bd_idx];
+    int8_t high_range = libaom_test::high_range_arr[bd_idx];
+    // TODO(angiebird): include rect txfm in this test
+    for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+      for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+        TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(
+            static_cast<TX_TYPE>(tx_type), static_cast<TX_SIZE>(tx_size));
+        int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+        int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+        av1_gen_fwd_stage_range(stage_range_col, stage_range_row, &cfg, bd);
+        const TXFM_1D_CFG *col_cfg = cfg.col_cfg;
+        const TXFM_1D_CFG *row_cfg = cfg.row_cfg;
+        libaom_test::txfm_stage_range_check(stage_range_col, col_cfg->stage_num,
+                                            col_cfg->cos_bit, low_range,
+                                            high_range);
+        libaom_test::txfm_stage_range_check(stage_range_row, row_cfg->stage_num,
+                                            row_cfg->cos_bit, low_range,
+                                            high_range);
+      }
+    }
+  }
+}
+
 #endif  // CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/av1_highbd_iht_test.cc b/third_party/aom/test/av1_highbd_iht_test.cc
index 3b263638f..45df5ed84 100644
--- a/third_party/aom/test/av1_highbd_iht_test.cc
+++ b/third_party/aom/test/av1_highbd_iht_test.cc
@@ -26,10 +26,10 @@ using std::tr1::tuple;
 using libaom_test::ACMRandom;
 
 typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
-                          int tx_type, int bd);
+                          TX_TYPE tx_type, int bd);
 
 typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
-                           int tx_type, int bd);
+                           TX_TYPE tx_type, int bd);
 
 // Test parameter argument list:
 //   <transform reference function,
@@ -38,7 +38,7 @@ typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
 //    num_coeffs,
 //    tx_type,
 //    bit_depth>
-typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, int, int> IHbdHtParam;
+typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, TX_TYPE, int> IHbdHtParam;
 
 class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
  public:
@@ -97,7 +97,7 @@ class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
   IHbdHtFunc inv_txfm_;
   IHbdHtFunc inv_txfm_ref_;
   int num_coeffs_;
-  int tx_type_;
+  TX_TYPE tx_type_;
   int bit_depth_;
 
   int16_t *input_;
@@ -135,21 +135,26 @@ TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); }
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && \
+    !(CONFIG_DAALA_DCT4 && CONFIG_DAALA_DCT8 && CONFIG_DAALA_DCT16)
+#if !CONFIG_DAALA_DCT4
 #define PARAM_LIST_4X4                                   \
   &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
       &av1_inv_txfm2d_add_4x4_c, 16
-
+#endif
+#if !CONFIG_DAALA_DCT8
 #define PARAM_LIST_8X8                                   \
   &av1_fwd_txfm2d_8x8_c, &av1_inv_txfm2d_add_8x8_sse4_1, \
       &av1_inv_txfm2d_add_8x8_c, 64
-
+#endif
+#if !CONFIG_DAALA_DCT16
 #define PARAM_LIST_16X16                                     \
   &av1_fwd_txfm2d_16x16_c, &av1_inv_txfm2d_add_16x16_sse4_1, \
       &av1_inv_txfm2d_add_16x16_c, 256
-
+#endif
 const IHbdHtParam kArrayIhtParam[] = {
-  // 16x16
+// 16x16
+#if !CONFIG_DAALA_DCT16
   make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
   make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
   make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
@@ -170,7 +175,9 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
 #endif
-  // 8x8
+#endif
+// 8x8
+#if !CONFIG_DAALA_DCT8
   make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
   make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
   make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
@@ -191,7 +198,9 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
 #endif
-  // 4x4
+#endif
+// 4x4
+#if !CONFIG_DAALA_DCT4
   make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
   make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
   make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
@@ -212,13 +221,15 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
 #endif
+#endif
 };
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
                         ::testing::ValuesIn(kArrayIhtParam));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH &&
+        //  !(CONFIG_DAALA_DCT4 && CONFIG_DAALA_DCT8 && CONFIG_DAALA_DCT16)
 
-#if HAVE_AVX2 && CONFIG_HIGHBITDEPTH
+#if HAVE_AVX2 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT32
 #define PARAM_LIST_32X32                                   \
   &av1_fwd_txfm2d_32x32_c, &av1_inv_txfm2d_add_32x32_avx2, \
       &av1_inv_txfm2d_add_32x32_c, 1024
diff --git a/third_party/aom/test/av1_inv_txfm1d_test.cc b/third_party/aom/test/av1_inv_txfm1d_test.cc
index b871105eb..b44c04116 100644
--- a/third_party/aom/test/av1_inv_txfm1d_test.cc
+++ b/third_party/aom/test/av1_inv_txfm1d_test.cc
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <math.h>
+
 #include "test/av1_txfm_test.h"
 #include "test/util.h"
 #include "av1/common/av1_fwd_txfm1d.h"
@@ -45,6 +47,68 @@ const TxfmFunc inv_txfm_func_ls[][2] = {
 const int8_t cos_bit[12] = { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 };
 const int8_t range_bit[12] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 };
 
+void reference_idct_1d_int(const int32_t *in, int32_t *out, int size) {
+  double input[64];
+  for (int i = 0; i < size; ++i) input[i] = in[i];
+
+  double output[64];
+  libaom_test::reference_idct_1d(input, output, size);
+
+  for (int i = 0; i < size; ++i)
+    out[i] = static_cast<int32_t>(round(output[i]));
+}
+
+void random_matrix(int32_t *dst, int len, ACMRandom *rnd) {
+  const int bits = 16;
+  const int maxVal = (1 << (bits - 1)) - 1;
+  const int minVal = -(1 << (bits - 1));
+  for (int i = 0; i < len; ++i) {
+    if (rnd->Rand8() % 10)
+      dst[i] = minVal + rnd->Rand16() % (1 << bits);
+    else
+      dst[i] = rnd->Rand8() % 2 ? minVal : maxVal;
+  }
+}
+
+TEST(av1_inv_txfm1d, InvAccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 20000;
+  const int max_error[] = { 6, 10, 19, 28 };
+  for (int k = 0; k < count_test_block; ++k) {
+    // choose a random transform to test
+    const int txfm_type = rnd.Rand8() % NELEMENTS(inv_txfm_func_ls);
+    const int txfm_size = txfm_size_ls[txfm_type];
+    const TxfmFunc txfm_func = inv_txfm_func_ls[txfm_type][0];
+
+    int32_t input[64];
+    random_matrix(input, txfm_size, &rnd);
+
+    int32_t ref_output[64];
+    reference_idct_1d_int(input, ref_output, txfm_size);
+
+    int32_t output[64];
+    txfm_func(input, output, cos_bit, range_bit);
+
+    for (int i = 0; i < txfm_size; ++i) {
+      EXPECT_LE(abs(output[i] - ref_output[i]), max_error[txfm_type]);
+    }
+  }
+}
+
+static INLINE int get_max_bit(int x) {
+  int max_bit = -1;
+  while (x) {
+    x = x >> 1;
+    max_bit++;
+  }
+  return max_bit;
+}
+
+TEST(av1_inv_txfm1d, get_max_bit) {
+  int max_bit = get_max_bit(8);
+  EXPECT_EQ(max_bit, 3);
+}
+
 TEST(av1_inv_txfm1d, round_trip) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   for (int si = 0; si < NELEMENTS(fwd_txfm_func_ls); ++si) {
diff --git a/third_party/aom/test/av1_inv_txfm2d_test.cc b/third_party/aom/test/av1_inv_txfm2d_test.cc
index 5185c1ca8..bccbdeebf 100644
--- a/third_party/aom/test/av1_inv_txfm2d_test.cc
+++ b/third_party/aom/test/av1_inv_txfm2d_test.cc
@@ -40,11 +40,12 @@ class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
     tx_size_ = GET_PARAM(1);
     max_error_ = GET_PARAM(2);
     max_avg_error_ = GET_PARAM(3);
-    txfm1d_size_ = libaom_test::get_txfm1d_size(tx_size_);
-    txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
   }
 
   void RunRoundtripCheck() {
+    int tx_w = tx_size_wide[tx_size_];
+    int tx_h = tx_size_high[tx_size_];
+    int txfm2d_size = tx_w * tx_h;
     const Fwd_Txfm2d_Func fwd_txfm_func =
         libaom_test::fwd_txfm_func_ls[tx_size_];
     const Inv_Txfm2d_Func inv_txfm_func =
@@ -56,9 +57,9 @@ class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
 
     for (int ci = 0; ci < count; ci++) {
       int16_t expected[64 * 64] = { 0 };
-      ASSERT_LT(txfm2d_size_, NELEMENTS(expected));
+      ASSERT_LT(txfm2d_size, NELEMENTS(expected));
 
-      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+      for (int ni = 0; ni < txfm2d_size; ++ni) {
         if (ci == 0) {
           int extreme_input = input_base - 1;
           expected[ni] = extreme_input;  // extreme case
@@ -68,25 +69,26 @@ class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
       }
 
       int32_t coeffs[64 * 64] = { 0 };
-      ASSERT_LT(txfm2d_size_, NELEMENTS(coeffs));
-      fwd_txfm_func(expected, coeffs, txfm1d_size_, tx_type_, bd);
+      ASSERT_LT(txfm2d_size, NELEMENTS(coeffs));
+      fwd_txfm_func(expected, coeffs, tx_w, tx_type_, bd);
 
       uint16_t actual[64 * 64] = { 0 };
-      ASSERT_LT(txfm2d_size_, NELEMENTS(actual));
-      inv_txfm_func(coeffs, actual, txfm1d_size_, tx_type_, bd);
+      ASSERT_LT(txfm2d_size, NELEMENTS(actual));
+      inv_txfm_func(coeffs, actual, tx_w, tx_type_, bd);
 
-      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+      for (int ni = 0; ni < txfm2d_size; ++ni) {
         EXPECT_GE(max_error_, abs(expected[ni] - actual[ni]));
       }
       avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
-          expected, actual, txfm2d_size_);
+          expected, actual, txfm2d_size);
     }
 
     avg_abs_error /= count;
     // max_abs_avg_error comes from upper bound of
     // printf("txfm1d_size: %d accuracy_avg_abs_error: %f\n",
     // txfm1d_size_, avg_abs_error);
-    EXPECT_GE(max_avg_error_, avg_abs_error);
+    EXPECT_GE(max_avg_error_, avg_abs_error)
+        << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
   }
 
  private:
@@ -94,14 +96,53 @@ class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
   double max_avg_error_;
   TX_TYPE tx_type_;
   TX_SIZE tx_size_;
-  int txfm1d_size_;
-  int txfm2d_size_;
 };
 
 TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
 
 const AV1InvTxfm2dParam av1_inv_txfm2d_param[] = {
 #if CONFIG_EXT_TX
+#if CONFIG_RECT_TX
+  AV1InvTxfm2dParam(DCT_DCT, TX_4X8, 2, 0.007),
+  AV1InvTxfm2dParam(ADST_DCT, TX_4X8, 2, 0.012),
+  AV1InvTxfm2dParam(DCT_ADST, TX_4X8, 2, 0.012),
+  AV1InvTxfm2dParam(ADST_ADST, TX_4X8, 2, 0.012),
+  AV1InvTxfm2dParam(FLIPADST_DCT, TX_4X8, 2, 0.012),
+  AV1InvTxfm2dParam(DCT_FLIPADST, TX_4X8, 2, 0.012),
+  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_4X8, 2, 0.012),
+  AV1InvTxfm2dParam(ADST_FLIPADST, TX_4X8, 2, 0.012),
+  AV1InvTxfm2dParam(FLIPADST_ADST, TX_4X8, 2, 0.012),
+
+  AV1InvTxfm2dParam(DCT_DCT, TX_8X4, 2, 0.007),
+  AV1InvTxfm2dParam(ADST_DCT, TX_8X4, 2, 0.012),
+  AV1InvTxfm2dParam(DCT_ADST, TX_8X4, 2, 0.012),
+  AV1InvTxfm2dParam(ADST_ADST, TX_8X4, 2, 0.012),
+  AV1InvTxfm2dParam(FLIPADST_DCT, TX_8X4, 2, 0.007),
+  AV1InvTxfm2dParam(DCT_FLIPADST, TX_8X4, 2, 0.012),
+  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X4, 2, 0.012),
+  AV1InvTxfm2dParam(ADST_FLIPADST, TX_8X4, 2, 0.012),
+  AV1InvTxfm2dParam(FLIPADST_ADST, TX_8X4, 2, 0.012),
+
+  AV1InvTxfm2dParam(DCT_DCT, TX_8X16, 2, 0.025),
+  AV1InvTxfm2dParam(ADST_DCT, TX_8X16, 2, 0.020),
+  AV1InvTxfm2dParam(DCT_ADST, TX_8X16, 2, 0.027),
+  AV1InvTxfm2dParam(ADST_ADST, TX_8X16, 2, 0.023),
+  AV1InvTxfm2dParam(FLIPADST_DCT, TX_8X16, 2, 0.020),
+  AV1InvTxfm2dParam(DCT_FLIPADST, TX_8X16, 2, 0.027),
+  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X16, 2, 0.032),
+  AV1InvTxfm2dParam(ADST_FLIPADST, TX_8X16, 2, 0.023),
+  AV1InvTxfm2dParam(FLIPADST_ADST, TX_8X16, 2, 0.023),
+
+  AV1InvTxfm2dParam(DCT_DCT, TX_16X8, 2, 0.007),
+  AV1InvTxfm2dParam(ADST_DCT, TX_16X8, 2, 0.012),
+  AV1InvTxfm2dParam(DCT_ADST, TX_16X8, 2, 0.024),
+  AV1InvTxfm2dParam(ADST_ADST, TX_16X8, 2, 0.033),
+  AV1InvTxfm2dParam(FLIPADST_DCT, TX_16X8, 2, 0.015),
+  AV1InvTxfm2dParam(DCT_FLIPADST, TX_16X8, 2, 0.032),
+  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_16X8, 2, 0.032),
+  AV1InvTxfm2dParam(ADST_FLIPADST, TX_16X8, 2, 0.033),
+  AV1InvTxfm2dParam(FLIPADST_ADST, TX_16X8, 2, 0.032),
+#endif
   AV1InvTxfm2dParam(FLIPADST_DCT, TX_4X4, 2, 0.002),
   AV1InvTxfm2dParam(DCT_FLIPADST, TX_4X4, 2, 0.002),
   AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.002),
@@ -144,6 +185,32 @@ const AV1InvTxfm2dParam av1_inv_txfm2d_param[] = {
 INSTANTIATE_TEST_CASE_P(C, AV1InvTxfm2d,
                         ::testing::ValuesIn(av1_inv_txfm2d_param));
 
+TEST(AV1InvTxfm2d, CfgTest) {
+  for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+    int bd = libaom_test::bd_arr[bd_idx];
+    int8_t low_range = libaom_test::low_range_arr[bd_idx];
+    int8_t high_range = libaom_test::high_range_arr[bd_idx];
+    // TODO(angiebird): include rect txfm in this test
+    for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+      for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+        TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(
+            static_cast<TX_TYPE>(tx_type), static_cast<TX_SIZE>(tx_size));
+        int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+        int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+        av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
+                                fwd_shift_sum[tx_size], bd);
+        const TXFM_1D_CFG *col_cfg = cfg.col_cfg;
+        const TXFM_1D_CFG *row_cfg = cfg.row_cfg;
+        libaom_test::txfm_stage_range_check(stage_range_col, col_cfg->stage_num,
+                                            col_cfg->cos_bit, low_range,
+                                            high_range);
+        libaom_test::txfm_stage_range_check(stage_range_row, row_cfg->stage_num,
+                                            row_cfg->cos_bit, low_range,
+                                            high_range);
+      }
+    }
+  }
+}
 #endif  // CONFIG_HIGHBITDEPTH
 
 }  // namespace
diff --git a/third_party/aom/test/av1_inv_txfm_test.cc b/third_party/aom/test/av1_inv_txfm_test.cc
index 34d45e08b..873e80685 100644
--- a/third_party/aom/test/av1_inv_txfm_test.cc
+++ b/third_party/aom/test/av1_inv_txfm_test.cc
@@ -18,6 +18,7 @@
 #include "./av1_rtcd.h"
 #include "./aom_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -29,19 +30,6 @@
 using libaom_test::ACMRandom;
 
 namespace {
-const double kInvSqrt2 = 0.707106781186547524400844362104;
-
-void reference_idct_1d(const double *in, double *out, int size) {
-  for (int n = 0; n < size; ++n) {
-    out[n] = 0;
-    for (int k = 0; k < size; ++k) {
-      if (k == 0)
-        out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
-      else
-        out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
-    }
-  }
-}
 
 typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
 
@@ -65,7 +53,7 @@ class TransTestBase {
       }
 
       inv_txfm_(input, output);
-      reference_idct_1d(ref_input, ref_output, txfm_size_);
+      libaom_test::reference_idct_1d(ref_input, ref_output, txfm_size_);
 
       for (int ni = 0; ni < txfm_size_; ++ni) {
         EXPECT_LE(
diff --git a/third_party/aom/test/av1_quantize_test.cc b/third_party/aom/test/av1_quantize_test.cc
index 239b041b2..36ac8c4ad 100644
--- a/third_party/aom/test/av1_quantize_test.cc
+++ b/third_party/aom/test/av1_quantize_test.cc
@@ -99,8 +99,8 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
       for (int j = 0; j < count; ++j) {
         err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
                      (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
-        EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j]) << "qcoeff error: i = " << i
-                                                    << " j = " << j << "\n";
+        EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+            << "qcoeff error: i = " << i << " j = " << j << "\n";
         EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
             << "dqcoeff error: i = " << i << " j = " << j << "\n";
       }
@@ -195,7 +195,6 @@ TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
 TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
 
 #if HAVE_SSE4_1
-#if !CONFIG_AOM_QM
 const QuantizeFuncParams qfps[4] = {
   QuantizeFuncParams(av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
                      16),
@@ -208,6 +207,5 @@ const QuantizeFuncParams qfps[4] = {
 };
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
-#endif  // !CONFIG_AOM_QM
 #endif  // HAVE_SSE4_1
 }  // namespace
diff --git a/third_party/aom/test/av1_txfm_test.cc b/third_party/aom/test/av1_txfm_test.cc
index 1e473b304..4545de100 100644
--- a/third_party/aom/test/av1_txfm_test.cc
+++ b/third_party/aom/test/av1_txfm_test.cc
@@ -66,16 +66,32 @@ void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) {
 
 double invSqrt2 = 1 / pow(2, 0.5);
 
+double dct_matrix(double n, double k, int size) {
+  return cos(M_PI * (2 * n + 1) * k / (2 * size));
+}
+
 void reference_dct_1d(const double *in, double *out, int size) {
   for (int k = 0; k < size; ++k) {
     out[k] = 0;
     for (int n = 0; n < size; ++n) {
-      out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (2 * size));
+      out[k] += in[n] * dct_matrix(n, k, size);
     }
     if (k == 0) out[k] = out[k] * invSqrt2;
   }
 }
 
+void reference_idct_1d(const double *in, double *out, int size) {
+  for (int k = 0; k < size; ++k) {
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {
+      if (n == 0)
+        out[k] += invSqrt2 * in[n] * dct_matrix(k, n, size);
+      else
+        out[k] += in[n] * dct_matrix(k, n, size);
+    }
+  }
+}
+
 void reference_adst_1d(const double *in, double *out, int size) {
   for (int k = 0; k < size; ++k) {
     out[k] = 0;
@@ -161,4 +177,20 @@ template void fliplr<double>(double *dest, int stride, int length);
 template void flipud<double>(double *dest, int stride, int length);
 template void fliplrud<double>(double *dest, int stride, int length);
 
+int bd_arr[BD_NUM] = { 8, 10, 12 };
+int8_t low_range_arr[BD_NUM] = { 16, 32, 32 };
+int8_t high_range_arr[BD_NUM] = { 32, 32, 32 };
+
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+                            const int8_t *cos_bit, int low_range,
+                            int high_range) {
+  for (int i = 0; i < stage_num; ++i) {
+    EXPECT_LE(stage_range[i], low_range);
+  }
+  for (int i = 0; i < stage_num - 1; ++i) {
+    // make sure there is no overflow while doing half_btf()
+    EXPECT_LE(stage_range[i] + cos_bit[i], high_range);
+    EXPECT_LE(stage_range[i + 1] + cos_bit[i], high_range);
+  }
+}
 }  // namespace libaom_test
diff --git a/third_party/aom/test/av1_txfm_test.h b/third_party/aom/test/av1_txfm_test.h
index d46f0bba7..3e64e36ad 100644
--- a/third_party/aom/test/av1_txfm_test.h
+++ b/third_party/aom/test/av1_txfm_test.h
@@ -40,6 +40,7 @@ int get_txfm1d_size(TX_SIZE tx_size);
 void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1);
 
 void reference_dct_1d(const double *in, double *out, int size);
+void reference_idct_1d(const double *in, double *out, int size);
 
 void reference_adst_1d(const double *in, double *out, int size);
 
@@ -70,31 +71,79 @@ void fliplrud(Type *dest, int stride, int length);
 typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t *cos_bit,
                          const int8_t *range_bit);
 
-typedef void (*Fwd_Txfm2d_Func)(const int16_t *, int32_t *, int, int, int);
-typedef void (*Inv_Txfm2d_Func)(const int32_t *, uint16_t *, int, int, int);
+typedef void (*Fwd_Txfm2d_Func)(const int16_t *, int32_t *, int, TX_TYPE, int);
+typedef void (*Inv_Txfm2d_Func)(const int32_t *, uint16_t *, int, TX_TYPE, int);
 
 static const int bd = 10;
 static const int input_base = (1 << bd);
 
 #if CONFIG_HIGHBITDEPTH
 #if CONFIG_AV1_ENCODER
-static const Fwd_Txfm2d_Func fwd_txfm_func_ls[TX_SIZES] = {
+
+static const Fwd_Txfm2d_Func fwd_txfm_func_ls[TX_SIZES_ALL] = {
 #if CONFIG_CHROMA_2X2
   NULL,
 #endif
-  av1_fwd_txfm2d_4x4_c, av1_fwd_txfm2d_8x8_c, av1_fwd_txfm2d_16x16_c,
-  av1_fwd_txfm2d_32x32_c
+  av1_fwd_txfm2d_4x4_c,
+  av1_fwd_txfm2d_8x8_c,
+  av1_fwd_txfm2d_16x16_c,
+  av1_fwd_txfm2d_32x32_c,
+#if CONFIG_TX64X64
+  av1_fwd_txfm2d_64x64_c,
+#endif  // CONFIG_TX64X64
+  av1_fwd_txfm2d_4x8_c,
+  av1_fwd_txfm2d_8x4_c,
+  av1_fwd_txfm2d_8x16_c,
+  av1_fwd_txfm2d_16x8_c,
+  av1_fwd_txfm2d_16x32_c,
+  av1_fwd_txfm2d_32x16_c,
+#if CONFIG_TX64X64
+  av1_fwd_txfm2d_32x64_c,
+  av1_fwd_txfm2d_64x32_c,
+#endif  // CONFIG_TX64X64
+  NULL,
+  NULL,
+  NULL,
+  NULL,
 };
 #endif
 
-static const Inv_Txfm2d_Func inv_txfm_func_ls[TX_SIZES] = {
+static const Inv_Txfm2d_Func inv_txfm_func_ls[TX_SIZES_ALL] = {
 #if CONFIG_CHROMA_2X2
   NULL,
 #endif
-  av1_inv_txfm2d_add_4x4_c, av1_inv_txfm2d_add_8x8_c,
-  av1_inv_txfm2d_add_16x16_c, av1_inv_txfm2d_add_32x32_c
+  av1_inv_txfm2d_add_4x4_c,
+  av1_inv_txfm2d_add_8x8_c,
+  av1_inv_txfm2d_add_16x16_c,
+  av1_inv_txfm2d_add_32x32_c,
+#if CONFIG_TX64X64
+  av1_inv_txfm2d_add_64x64_c,
+#endif  // CONFIG_TX64X64
+  av1_inv_txfm2d_add_4x8_c,
+  av1_inv_txfm2d_add_8x4_c,
+  av1_inv_txfm2d_add_8x16_c,
+  av1_inv_txfm2d_add_16x8_c,
+  av1_inv_txfm2d_add_16x32_c,
+  av1_inv_txfm2d_add_32x16_c,
+#if CONFIG_TX64X64
+  av1_inv_txfm2d_add_32x64_c,
+  av1_inv_txfm2d_add_64x32_c,
+#endif  // CONFIG_TX64X64
+  NULL,
+  NULL,
+  NULL,
+  NULL,
 };
 #endif  // CONFIG_HIGHBITDEPTH
 
+#define BD_NUM 3
+
+extern int bd_arr[];
+extern int8_t low_range_arr[];
+extern int8_t high_range_arr[];
+
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+                            const int8_t *cos_bit, int low_range,
+                            int high_range);
 }  // namespace libaom_test
 #endif  // AV1_TXFM_TEST_H_
diff --git a/third_party/aom/test/boolcoder_test.cc b/third_party/aom/test/boolcoder_test.cc
index 7abe1b1b6..916a54427 100644
--- a/third_party/aom/test/boolcoder_test.cc
+++ b/third_party/aom/test/boolcoder_test.cc
@@ -86,11 +86,7 @@ TEST(AV1, TestBitIO) {
   }
 }
 
-#if CONFIG_EC_SMALLMUL
 #define FRAC_DIFF_TOTAL_ERROR 0.16
-#else
-#define FRAC_DIFF_TOTAL_ERROR 0.07
-#endif
 
 TEST(AV1, TestTell) {
   const int kBufferSize = 10000;
@@ -116,8 +112,8 @@ TEST(AV1, TestTell) {
       aom_read(&br, p, NULL);
       uint32_t tell = aom_reader_tell(&br);
       uint32_t tell_frac = aom_reader_tell_frac(&br);
-      GTEST_ASSERT_GE(tell, last_tell) << "tell: " << tell
-                                       << ", last_tell: " << last_tell;
+      GTEST_ASSERT_GE(tell, last_tell)
+          << "tell: " << tell << ", last_tell: " << last_tell;
       GTEST_ASSERT_GE(tell_frac, last_tell_frac)
           << "tell_frac: " << tell_frac
           << ", last_tell_frac: " << last_tell_frac;
diff --git a/third_party/aom/test/cdef_test.cc b/third_party/aom/test/cdef_test.cc
new file mode 100644
index 000000000..b6250b6e9
--- /dev/null
+++ b/third_party/aom/test/cdef_test.cc
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+*/
+
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/cdef_block.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef std::tr1::tuple<cdef_filter_block_func, cdef_filter_block_func, int>
+    cdef_dir_param_t;
+
+class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
+ public:
+  virtual ~CDEFBlockTest() {}
+  virtual void SetUp() {
+    cdef = GET_PARAM(0);
+    ref_cdef = GET_PARAM(1);
+    bsize = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  int bsize;
+  cdef_filter_block_func cdef;
+  cdef_filter_block_func ref_cdef;
+};
+
+typedef CDEFBlockTest CDEFSpeedTest;
+
+void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
+               cdef_filter_block_func ref_cdef) {
+  const int size = 8;
+  const int ysize = size + 2 * CDEF_VBORDER;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]);
+  DECLARE_ALIGNED(16, static uint16_t, d[size * size]);
+  DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  int error = 0, pristrength = 0, secstrength, dir;
+  int boundary, pridamping, secdamping, depth, bits, level, count,
+      errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0,
+      errpridamping = 0, errsecdamping = 0;
+  unsigned int pos = 0;
+
+  for (boundary = 0; boundary < 16; boundary++) {
+    for (depth = 8; depth <= 12; depth += 2) {
+      const unsigned int max_pos = size * size >> (depth == 8);
+      for (pridamping = 3 + depth - 8;
+           pridamping < 7 - 3 * !!boundary + depth - 8; pridamping++) {
+        for (secdamping = 3 + depth - 8;
+             secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) {
+          for (count = 0; count < iterations; count++) {
+            for (level = 0; level < (1 << depth) && !error;
+                 level += (2 + 6 * !!boundary) << (depth - 8)) {
+              for (bits = 1; bits <= depth && !error;
+                   bits += 1 + 3 * !!boundary) {
+                for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+                  s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                               (1 << depth) - 1);
+                if (boundary) {
+                  if (boundary & 1) {  // Left
+                    for (int i = 0; i < ysize; i++)
+                      for (int j = 0; j < CDEF_HBORDER; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                  if (boundary & 2) {  // Right
+                    for (int i = 0; i < ysize; i++)
+                      for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                  if (boundary & 4) {  // Above
+                    for (int i = 0; i < CDEF_VBORDER; i++)
+                      for (int j = 0; j < CDEF_BSTRIDE; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                  if (boundary & 8) {  // Below
+                    for (int i = CDEF_VBORDER + size; i < ysize; i++)
+                      for (int j = 0; j < CDEF_BSTRIDE; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                }
+                for (dir = 0; dir < 8; dir++) {
+                  for (pristrength = 0;
+                       pristrength <= 19 << (depth - 8) && !error;
+                       pristrength += (1 + 4 * !!boundary) << (depth - 8)) {
+                    if (pristrength == 16) pristrength = 19;
+                    for (secstrength = 0;
+                         secstrength <= 4 << (depth - 8) && !error;
+                         secstrength += 1 << (depth - 8)) {
+                      if (secstrength == 3 << (depth - 8)) continue;
+                      ref_cdef(depth == 8 ? (uint8_t *)ref_d : 0, ref_d, size,
+                               s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                               pristrength, secstrength, dir, pridamping,
+                               secdamping, bsize, (1 << depth) - 1);
+                      // If cdef and ref_cdef are the same, we're just testing
+                      // speed
+                      if (cdef != ref_cdef)
+                        ASM_REGISTER_STATE_CHECK(
+                            cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
+                                 s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                                 pristrength, secstrength, dir, pridamping,
+                                 secdamping, bsize, (1 << depth) - 1));
+                      if (ref_cdef != cdef) {
+                        for (pos = 0; pos < max_pos && !error; pos++) {
+                          error = ref_d[pos] != d[pos];
+                          errdepth = depth;
+                          errpristrength = pristrength;
+                          errsecstrength = secstrength;
+                          errboundary = boundary;
+                          errpridamping = pridamping;
+                          errsecdamping = secdamping;
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  pos--;
+  EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch."
+                      << std::endl
+                      << "First error at " << pos % size << "," << pos / size
+                      << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos]
+                      << ") " << std::endl
+                      << "pristrength: " << errpristrength << std::endl
+                      << "pridamping: " << errpridamping << std::endl
+                      << "secstrength: " << errsecstrength << std::endl
+                      << "secdamping: " << errsecdamping << std::endl
+                      << "depth: " << errdepth << std::endl
+                      << "size: " << bsize << std::endl
+                      << "boundary: " << errboundary << std::endl
+                      << std::endl;
+}
+
+void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef,
+                     cdef_filter_block_func ref_cdef) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&ref_timer);
+  test_cdef(bsize, iterations, ref_cdef, ref_cdef);
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer_start(&timer);
+  test_cdef(bsize, iterations, cdef, cdef);
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+#if 0
+  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
+            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
+#endif
+
+  EXPECT_GT(ref_elapsed_time, elapsed_time)
+      << "Error: CDEFSpeedTest, SIMD slower than C." << std::endl
+      << "C time: " << ref_elapsed_time << " us" << std::endl
+      << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
+                          int coeff_shift);
+
+typedef std::tr1::tuple<find_dir_t, find_dir_t> find_dir_param_t;
+
+class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
+ public:
+  virtual ~CDEFFindDirTest() {}
+  virtual void SetUp() {
+    finddir = GET_PARAM(0);
+    ref_finddir = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  find_dir_t finddir;
+  find_dir_t ref_finddir;
+};
+
+typedef CDEFFindDirTest CDEFFindDirSpeedTest;
+
+void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
+                                 int coeff_shift),
+                  int (*ref_finddir)(const uint16_t *img, int stride,
+                                     int32_t *var, int coeff_shift)) {
+  const int size = 8;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, s[size * size]);
+
+  int error = 0;
+  int depth, bits, level, count, errdepth = 0;
+  int ref_res = 0, res = 0;
+  int32_t ref_var = 0, var = 0;
+
+  for (depth = 8; depth <= 12 && !error; depth += 2) {
+    for (count = 0; count < 512 && !error; count++) {
+      for (level = 0; level < (1 << depth) && !error;
+           level += 1 << (depth - 8)) {
+        for (bits = 1; bits <= depth && !error; bits++) {
+          for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+            s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                         (1 << depth) - 1);
+          for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
+            ref_res = ref_finddir(s, size, &ref_var, depth - 8);
+          if (finddir != ref_finddir)
+            ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
+          if (ref_finddir != finddir) {
+            if (res != ref_res || var != ref_var) error = 1;
+            errdepth = depth;
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch."
+                      << std::endl
+                      << "return: " << res << " : " << ref_res << std::endl
+                      << "var: " << var << " : " << ref_var << std::endl
+                      << "depth: " << errdepth << std::endl
+                      << std::endl;
+}
+
+void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
+                                       int32_t *var, int coeff_shift),
+                        int (*ref_finddir)(const uint16_t *img, int stride,
+                                           int32_t *var, int coeff_shift)) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&ref_timer);
+  test_finddir(ref_finddir, ref_finddir);
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer_start(&timer);
+  test_finddir(finddir, finddir);
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+#if 0
+  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
+            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
+#endif
+
+  EXPECT_GT(ref_elapsed_time, elapsed_time)
+      << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl
+      << "C time: " << ref_elapsed_time << " us" << std::endl
+      << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
+  test_cdef(bsize, 1, cdef, ref_cdef);
+}
+
+TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) {
+  test_cdef_speed(bsize, 4, cdef, ref_cdef);
+}
+
+TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) {
+  test_finddir(finddir, ref_finddir);
+}
+
+TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) {
+  test_finddir_speed(finddir, ref_finddir);
+}
+
+using std::tr1::make_tuple;
+
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                     &cdef_find_dir_c)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, CDEFBlockTest,
+    ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_4X4),
+                      make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                     &cdef_find_dir_c)));
+#endif
+
+// Test speed for all supported architectures
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, CDEFSpeedTest,
+    ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_4X4),
+                      make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#endif  // defined(_WIN64) || !defined(_MSC_VER)
+}  // namespace
diff --git a/third_party/aom/test/clpf_test.cc b/third_party/aom/test/clpf_test.cc
index 2c0f8cf7f..ecb042876 100644
--- a/third_party/aom/test/clpf_test.cc
+++ b/third_party/aom/test/clpf_test.cc
@@ -17,7 +17,7 @@
 #include "./aom_config.h"
 #include "./av1_rtcd.h"
 #include "aom_ports/aom_timer.h"
-#include "av1/common/od_dering.h"
+#include "av1/common/cdef_block.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -128,22 +128,22 @@ void test_clpf(int w, int h, unsigned int depth, unsigned int iterations,
               if (boundary & 1) {  // Left
                 for (int i = 0; i < size; i++)
                   for (int j = 0; j < xpos; j++)
-                    s[i * size + j] = OD_DERING_VERY_LARGE;
+                    s[i * size + j] = CDEF_VERY_LARGE;
               }
               if (boundary & 2) {  // Right
                 for (int i = 0; i < size; i++)
                   for (int j = xpos + w; j < size; j++)
-                    s[i * size + j] = OD_DERING_VERY_LARGE;
+                    s[i * size + j] = CDEF_VERY_LARGE;
               }
               if (boundary & 4) {  // Above
                 for (int i = 0; i < ypos; i++)
                   for (int j = 0; j < size; j++)
-                    s[i * size + j] = OD_DERING_VERY_LARGE;
+                    s[i * size + j] = CDEF_VERY_LARGE;
               }
               if (boundary & 8) {  // Below
                 for (int i = ypos + h; i < size; i++)
                   for (int j = 0; j < size; j++)
-                    s[i * size + j] = OD_DERING_VERY_LARGE;
+                    s[i * size + j] = CDEF_VERY_LARGE;
               }
             }
             for (strength = depth - 8; strength < depth - 5 && !error;
diff --git a/third_party/aom/test/coding_path_sync.cc b/third_party/aom/test/coding_path_sync.cc
index 68ee160bf..5b6409d03 100644
--- a/third_party/aom/test/coding_path_sync.cc
+++ b/third_party/aom/test/coding_path_sync.cc
@@ -15,8 +15,6 @@
 
 #include "./aom_config.h"
 
-#if CONFIG_AV1_ENCODER && CONFIG_AV1_DECODER
-
 #include "aom_ports/mem.h"  // ROUND_POWER_OF_TWO
 #include "aom/aomcx.h"
 #include "aom/aomdx.h"
@@ -26,9 +24,9 @@
 using libaom_test::ACMRandom;
 namespace {
 
-struct CompressedSource {
-  explicit CompressedSource(int seed) : rnd_(seed) {
-    frame_count_ = 0;
+class CompressedSource {
+ public:
+  explicit CompressedSource(int seed) : rnd_(seed), frame_count_(0) {
     aom_codec_iface_t *algo = &aom_codec_av1_cx_algo;
 
     aom_codec_enc_cfg_t cfg;
@@ -39,8 +37,15 @@ struct CompressedSource {
     cfg.rc_end_usage = AOM_CQ;
     cfg.rc_max_quantizer = max_q;
     cfg.rc_min_quantizer = max_q;
-    cfg.g_w = WIDTH;
-    cfg.g_h = HEIGHT;
+
+    // choose the picture size
+    {
+      width_ = rnd_.PseudoUniform(kWidth - 8) + 8;
+      height_ = rnd_.PseudoUniform(kHeight - 8) + 8;
+    }
+
+    cfg.g_w = width_;
+    cfg.g_h = height_;
     cfg.g_lag_in_frames = 0;
 
     aom_codec_enc_init(&enc_, algo, &cfg, 0);
@@ -48,8 +53,8 @@ struct CompressedSource {
 
   ~CompressedSource() { aom_codec_destroy(&enc_); }
 
-  const aom_codec_cx_pkt_t *readFrame() {
-    uint8_t buf[WIDTH * HEIGHT * 3 / 2] = { 0 };
+  const aom_codec_cx_pkt_t *ReadFrame() {
+    uint8_t buf[kWidth * kHeight * 3 / 2] = { 0 };
 
     // render regular pattern
     const int period = rnd_.Rand8() % 32 + 1;
@@ -57,52 +62,63 @@ struct CompressedSource {
 
     const int val_a = rnd_.Rand8();
     const int val_b = rnd_.Rand8();
+
     for (int i = 0; i < (int)sizeof buf; ++i)
       buf[i] = (i + phase) % period < period / 2 ? val_a : val_b;
 
     aom_image_t img;
-    aom_img_wrap(&img, AOM_IMG_FMT_I420, WIDTH, HEIGHT, 0, buf);
+    aom_img_wrap(&img, AOM_IMG_FMT_I420, width_, height_, 0, buf);
     aom_codec_encode(&enc_, &img, frame_count_++, 1, 0, 0);
 
     aom_codec_iter_t iter = NULL;
-    return aom_codec_get_cx_data(&enc_, &iter);
+
+    const aom_codec_cx_pkt_t *pkt = NULL;
+
+    do {
+      pkt = aom_codec_get_cx_data(&enc_, &iter);
+    } while (pkt && pkt->kind != AOM_CODEC_CX_FRAME_PKT);
+
+    return pkt;
   }
 
  private:
+  static const int kWidth = 128;
+  static const int kHeight = 128;
+
   ACMRandom rnd_;
   aom_codec_ctx_t enc_;
   int frame_count_;
-  static const int WIDTH = 32;
-  static const int HEIGHT = 32;
+  int width_, height_;
 };
 
 // lowers an aom_image_t to a easily comparable/printable form
-std::vector<int16_t> serialize(const aom_image_t *img) {
-  const int w_uv = ROUND_POWER_OF_TWO(img->d_w, img->x_chroma_shift);
-  const int h_uv = ROUND_POWER_OF_TWO(img->d_h, img->y_chroma_shift);
-  const int w[] = { (int)img->d_w, w_uv, w_uv };
-  const int h[] = { (int)img->d_h, h_uv, h_uv };
-
+std::vector<int16_t> Serialize(const aom_image_t *img) {
   std::vector<int16_t> bytes;
   bytes.reserve(img->d_w * img->d_h * 3);
-  for (int plane = 0; plane < 3; ++plane)
-    for (int r = 0; r < h[plane]; ++r)
-      for (int c = 0; c < w[plane]; ++c) {
-        const int offset = r * img->stride[plane] + c;
+  for (int plane = 0; plane < 3; ++plane) {
+    const int w = aom_img_plane_width(img, plane);
+    const int h = aom_img_plane_height(img, plane);
+
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        unsigned char *row = img->planes[plane] + r * img->stride[plane];
         if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
-          bytes.push_back(img->planes[plane][offset * 2]);
+          bytes.push_back(row[c * 2]);
         else
-          bytes.push_back(img->planes[plane][offset]);
+          bytes.push_back(row[c]);
       }
+    }
+  }
 
   return bytes;
 }
 
-struct Decoder {
+class Decoder {
+ public:
   explicit Decoder(int allowLowbitdepth) {
     aom_codec_iface_t *algo = &aom_codec_av1_dx_algo;
 
-    aom_codec_dec_cfg cfg = { 0 };
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.allow_lowbitdepth = allowLowbitdepth;
 
     aom_codec_dec_init(&dec_, algo, &cfg, 0);
@@ -111,11 +127,11 @@ struct Decoder {
   ~Decoder() { aom_codec_destroy(&dec_); }
 
   std::vector<int16_t> decode(const aom_codec_cx_pkt_t *pkt) {
-    aom_codec_decode(&dec_, (uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz,
-                     NULL, 0);
+    aom_codec_decode(&dec_, static_cast<uint8_t *>(pkt->data.frame.buf),
+                     static_cast<unsigned int>(pkt->data.frame.sz), NULL, 0);
 
     aom_codec_iter_t iter = NULL;
-    return serialize(aom_codec_get_frame(&dec_, &iter));
+    return Serialize(aom_codec_get_frame(&dec_, &iter));
   }
 
  private:
@@ -124,22 +140,19 @@ struct Decoder {
 
 // Try to reveal a mismatch between LBD and HBD coding paths.
 TEST(CodingPathSync, SearchForHbdLbdMismatch) {
-  // disable test. Re-enable it locally to help diagnosing LBD/HBD mismatches.
-  // And re-enable it once both coding paths match
-  // so they don't diverge anymore.
-  return;
-
   const int count_tests = 100;
   for (int i = 0; i < count_tests; ++i) {
-    Decoder dec_HBD(0);
-    Decoder dec_LBD(1);
+    Decoder dec_hbd(0);
+    Decoder dec_lbd(1);
 
     CompressedSource enc(i);
-    const aom_codec_cx_pkt_t *frame = enc.readFrame();
-    ASSERT_EQ(dec_LBD.decode(frame), dec_HBD.decode(frame));
+    const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
+
+    std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
+    std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+
+    ASSERT_EQ(lbd_yuv, hbd_yuv);
   }
 }
 
 }  // namespace
-
-#endif
diff --git a/third_party/aom/test/convolve_round_test.cc b/third_party/aom/test/convolve_round_test.cc
index 6f77dbb80..4976b03c8 100644
--- a/third_party/aom/test/convolve_round_test.cc
+++ b/third_party/aom/test/convolve_round_test.cc
@@ -12,13 +12,13 @@
 #include <assert.h>
 
 #include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/aom_timer.h"
 
 using libaom_test::ACMRandom;
 
@@ -68,12 +68,18 @@ class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
   virtual void SetUp() {
     const size_t block_size = 128 * 128;
     src_ = reinterpret_cast<int32_t *>(
-        aom_memalign(16, 3 * block_size * sizeof(int32_t)));
-    dst_ref_ = reinterpret_cast<uint16_t *>(src_ + block_size);
-    dst_ = dst_ref_ + block_size;
+        aom_memalign(16, block_size * sizeof(*src_)));
+    dst_ref_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size * sizeof(*dst_ref_)));
+    dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size * sizeof(*dst_)));
   }
 
-  virtual void TearDown() { aom_free(src_); }
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dst_ref_);
+    aom_free(dst_);
+  }
 
   void ConvolveRoundingRun() {
     int test_num = 0;
@@ -82,7 +88,6 @@ class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
     int bits = 13;
     uint8_t *dst = 0;
     uint8_t *dst_ref = 0;
-    int diff_wide;
 
     if (data_path_ == LOWBITDEPTH_TEST) {
       dst = reinterpret_cast<uint8_t *>(dst_);
@@ -109,14 +114,24 @@ class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
       GenerateBufferWithRandom(src_, src_stride, bits, w, h);
 
       func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
-      func_(src_, src_stride, dst, dst_stride, w, h, bits);
-
-      diff_wide = w;
-      if (data_path_ == LOWBITDEPTH_TEST) diff_wide >>= 1;
-      for (int r = 0; r < h; ++r) {
-        for (int c = 0; c < diff_wide; ++c) {
-          ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
-              << "Mismatch at r: " << r << " c: " << c << " test: " << test_num;
+      ASM_REGISTER_STATE_CHECK(
+          func_(src_, src_stride, dst, dst_stride, w, h, bits));
+
+      if (data_path_ == LOWBITDEPTH_TEST) {
+        for (int r = 0; r < h; ++r) {
+          for (int c = 0; c < w; ++c) {
+            ASSERT_EQ(dst_ref[r * dst_stride + c], dst[r * dst_stride + c])
+                << "Mismatch at r: " << r << " c: " << c << " w: " << w
+                << " h: " << h << " test: " << test_num;
+          }
+        }
+      } else {
+        for (int r = 0; r < h; ++r) {
+          for (int c = 0; c < w; ++c) {
+            ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
+                << "Mismatch at r: " << r << " c: " << c << " w: " << w
+                << " h: " << h << " test: " << test_num;
+          }
         }
       }
 
diff --git a/third_party/aom/test/convolve_test.cc b/third_party/aom/test/convolve_test.cc
index a1fb2087d..ffe0b87d2 100644
--- a/third_party/aom/test/convolve_test.cc
+++ b/third_party/aom/test/convolve_test.cc
@@ -67,18 +67,7 @@ struct ConvolveFunctions {
 
 typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
-#define ALL_SIZES(convolve_fn)                                            \
-  make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn),   \
-      make_tuple(128, 128, &convolve_fn), make_tuple(4, 4, &convolve_fn), \
-      make_tuple(8, 4, &convolve_fn), make_tuple(4, 8, &convolve_fn),     \
-      make_tuple(8, 8, &convolve_fn), make_tuple(16, 8, &convolve_fn),    \
-      make_tuple(8, 16, &convolve_fn), make_tuple(16, 16, &convolve_fn),  \
-      make_tuple(32, 16, &convolve_fn), make_tuple(16, 32, &convolve_fn), \
-      make_tuple(32, 32, &convolve_fn), make_tuple(64, 32, &convolve_fn), \
-      make_tuple(32, 64, &convolve_fn), make_tuple(64, 64, &convolve_fn)
-#else
-#define ALL_SIZES(convolve_fn)                                            \
+#define ALL_SIZES_64(convolve_fn)                                         \
   make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn),         \
       make_tuple(4, 8, &convolve_fn), make_tuple(8, 8, &convolve_fn),     \
       make_tuple(16, 8, &convolve_fn), make_tuple(8, 16, &convolve_fn),   \
@@ -86,6 +75,13 @@ typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
       make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
       make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
       make_tuple(64, 64, &convolve_fn)
+
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define ALL_SIZES(convolve_fn)                                          \
+  make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn), \
+      make_tuple(128, 128, &convolve_fn), ALL_SIZES_64(convolve_fn)
+#else
+#define ALL_SIZES ALL_SIZES_64
 #endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
@@ -414,7 +410,9 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 
   void CheckGuardBlocks() {
     for (int i = 0; i < kOutputBufferSize; ++i) {
-      if (IsIndexInBorder(i)) EXPECT_EQ(255, output_[i]);
+      if (IsIndexInBorder(i)) {
+        EXPECT_EQ(255, output_[i]);
+      }
     }
   }
 
@@ -1282,9 +1280,9 @@ const ConvolveFunctions convolve12_avx2(
     wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
     wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
     wrap_convolve8_avg_c_12, 12);
-const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2),
-                                               ALL_SIZES(convolve10_avx2),
-                                               ALL_SIZES(convolve12_avx2) };
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2),
+                                               ALL_SIZES_64(convolve10_avx2),
+                                               ALL_SIZES_64(convolve12_avx2) };
 #else
 const ConvolveFunctions convolve8_avx2(
     aom_convolve_copy_c, aom_convolve_avg_c, aom_convolve8_horiz_avx2,
@@ -1293,7 +1291,7 @@ const ConvolveFunctions convolve8_avx2(
     aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
     aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
 
-const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2) };
 #endif  // CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve8_avx2));
@@ -1317,10 +1315,10 @@ const ConvolveFunctions convolve8_neon(
     aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
 #endif  // HAVE_NEON_ASM
 
-const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES(convolve8_neon) };
+const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES_64(convolve8_neon) };
 INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve8_neon));
-#endif  // HAVE_NEON
+#endif  // HAVE_NEON && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
 
 // TODO(any): Make DSPR2 versions support 128x128 128x64 64x128 block sizes
 #if HAVE_DSPR2 && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
@@ -1331,10 +1329,10 @@ const ConvolveFunctions convolve8_dspr2(
     aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
     aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
 
-const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) };
+const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES_64(convolve8_dspr2) };
 INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve8_dspr2));
-#endif  // HAVE_DSPR2
+#endif  // HAVE_DSPR2 && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
 
 // TODO(any): Make MSA versions support 128x128 128x64 64x128 block sizes
 #if HAVE_MSA && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
@@ -1345,8 +1343,8 @@ const ConvolveFunctions convolve8_msa(
     aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
     aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
 
-const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
+const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES_64(convolve8_msa) };
 INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve8_msa));
-#endif  // HAVE_MSA
+#endif  // HAVE_MSA && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
 }  // namespace
diff --git a/third_party/aom/test/datarate_test.cc b/third_party/aom/test/datarate_test.cc
index 6a1b4e101..d577be35a 100644
--- a/third_party/aom/test/datarate_test.cc
+++ b/third_party/aom/test/datarate_test.cc
@@ -89,8 +89,8 @@ class DatarateTestLarge
         duration * timebase_ * cfg_.rc_target_bitrate * 1000);
 
     // Buffer should not go negative.
-    ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-                                        << pkt->data.frame.pts;
+    ASSERT_GE(bits_in_buffer_model_, 0)
+        << "Buffer Underrun at frame " << pkt->data.frame.pts;
 
     const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
 
diff --git a/third_party/aom/test/dct16x16_test.cc b/third_party/aom/test/dct16x16_test.cc
index c2c072494..3cc0ed8c0 100644
--- a/third_party/aom/test/dct16x16_test.cc
+++ b/third_party/aom/test/dct16x16_test.cc
@@ -230,9 +230,11 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, aom_bit_depth_t> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t> Ht16x16Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, aom_bit_depth_t>
+typedef std::tr1::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
+    Dct16x16Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t>
+    Ht16x16Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
     Idct16x16Param;
 
 void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
@@ -510,8 +512,8 @@ class Trans16x16TestBase {
         const int diff = dst[j] - src[j];
 #endif  // CONFIG_HIGHBITDEPTH
         const uint32_t error = diff * diff;
-        EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error
-                             << " at index " << j;
+        EXPECT_GE(1u, error)
+            << "Error: 16x16 IDCT has error " << error << " at index " << j;
       }
     }
   }
@@ -778,94 +780,109 @@ using std::tr1::make_tuple;
 INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
                         ::testing::Values(make_tuple(&aom_fdct16x16_c,
                                                      &aom_idct16x16_256_add_c,
-                                                     0, AOM_BITS_8)));
+                                                     DCT_DCT, AOM_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
                         ::testing::Values(make_tuple(&aom_fdct16x16_c,
                                                      &aom_idct16x16_256_add_c,
-                                                     0, AOM_BITS_8)));
+                                                     DCT_DCT, AOM_BITS_8)));
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&fht16x16_10, &iht16x16_10, 0, AOM_BITS_10),
-        make_tuple(&fht16x16_10, &iht16x16_10, 1, AOM_BITS_10),
-        make_tuple(&fht16x16_10, &iht16x16_10, 2, AOM_BITS_10),
-        make_tuple(&fht16x16_10, &iht16x16_10, 3, AOM_BITS_10),
-        make_tuple(&fht16x16_12, &iht16x16_12, 0, AOM_BITS_12),
-        make_tuple(&fht16x16_12, &iht16x16_12, 1, AOM_BITS_12),
-        make_tuple(&fht16x16_12, &iht16x16_12, 2, AOM_BITS_12),
-        make_tuple(&fht16x16_12, &iht16x16_12, 3, AOM_BITS_12),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 0, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
+        make_tuple(&fht16x16_10, &iht16x16_10, DCT_DCT, AOM_BITS_10),
+        make_tuple(&fht16x16_10, &iht16x16_10, ADST_DCT, AOM_BITS_10),
+        make_tuple(&fht16x16_10, &iht16x16_10, DCT_ADST, AOM_BITS_10),
+        make_tuple(&fht16x16_10, &iht16x16_10, ADST_ADST, AOM_BITS_10),
+        make_tuple(&fht16x16_12, &iht16x16_12, DCT_DCT, AOM_BITS_12),
+        make_tuple(&fht16x16_12, &iht16x16_12, ADST_DCT, AOM_BITS_12),
+        make_tuple(&fht16x16_12, &iht16x16_12, DCT_ADST, AOM_BITS_12),
+        make_tuple(&fht16x16_12, &iht16x16_12, ADST_ADST, AOM_BITS_12),
+        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, DCT_DCT,
+                   AOM_BITS_8),
+        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, ADST_DCT,
+                   AOM_BITS_8),
+        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, DCT_ADST,
+                   AOM_BITS_8),
+        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, ADST_ADST,
+                   AOM_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 0, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
+    ::testing::Values(make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
+                                 DCT_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
+                                 ADST_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
+                                 DCT_ADST, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
+                                 ADST_ADST, AOM_BITS_8)));
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans16x16DCT,
     ::testing::Values(make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_neon,
-                                 0, AOM_BITS_8)));
+                                 DCT_DCT, AOM_BITS_8)));
 #endif
 
 #if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans16x16DCT,
-    ::testing::Values(make_tuple(&aom_fdct16x16_sse2,
-                                 &aom_idct16x16_256_add_sse2, 0, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
+                        ::testing::Values(make_tuple(
+                            &aom_fdct16x16_sse2, &aom_idct16x16_256_add_sse2,
+                            DCT_DCT, AOM_BITS_8)));
+#if !CONFIG_DAALA_DCT16
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 0, AOM_BITS_8),
+                                 DCT_DCT, AOM_BITS_8),
                       make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 1, AOM_BITS_8),
+                                 ADST_DCT, AOM_BITS_8),
                       make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 2, AOM_BITS_8),
+                                 DCT_ADST, AOM_BITS_8),
                       make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 3, AOM_BITS_8)));
+                                 ADST_ADST, AOM_BITS_8)));
+#endif  // CONFIG_DAALA_DCT16
 #endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
                         ::testing::Values(make_tuple(&aom_fdct16x16_sse2,
                                                      &aom_idct16x16_256_add_c,
-                                                     0, AOM_BITS_8)));
+                                                     DCT_DCT, AOM_BITS_8)));
+#if !CONFIG_DAALA_DCT16
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 0, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 3,
-                   AOM_BITS_8)));
+    ::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
+                                 DCT_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
+                                 ADST_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
+                                 DCT_ADST, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
+                                 ADST_ADST, AOM_BITS_8)));
+#endif
 #endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
 
 #if HAVE_MSA && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(MSA, Trans16x16DCT,
                         ::testing::Values(make_tuple(&aom_fdct16x16_msa,
                                                      &aom_idct16x16_256_add_msa,
-                                                     0, AOM_BITS_8)));
-#if !CONFIG_EXT_TX
+                                                     DCT_DCT, AOM_BITS_8)));
+#if !CONFIG_EXT_TX && !CONFIG_DAALA_DCT16
 // TODO(yaowu): re-enable this after msa versions are updated to match C.
 INSTANTIATE_TEST_CASE_P(
     DISABLED_MSA, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 0, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 1, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 2, AOM_BITS_8),
-        make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 3,
-                   AOM_BITS_8)));
-#endif  // !CONFIG_EXT_TX
+    ::testing::Values(make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
+                                 DCT_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
+                                 ADST_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
+                                 DCT_ADST, AOM_BITS_8),
+                      make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
+                                 ADST_ADST, AOM_BITS_8)));
+#endif  // !CONFIG_EXT_TX && !CONFIG_DAALA_DCT16
 #endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/dct32x32_test.cc b/third_party/aom/test/dct32x32_test.cc
index 0a30f7f38..02a723a9c 100644
--- a/third_party/aom/test/dct32x32_test.cc
+++ b/third_party/aom/test/dct32x32_test.cc
@@ -363,53 +363,63 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans32x32Test,
     ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_neon,
-                                 0, AOM_BITS_8),
+                                 DCT_DCT, AOM_BITS_8),
                       make_tuple(&aom_fdct32x32_rd_c,
-                                 &aom_idct32x32_1024_add_neon, 1, AOM_BITS_8)));
+                                 &aom_idct32x32_1024_add_neon, ADST_DCT,
+                                 AOM_BITS_8)));
 #endif  // HAVE_NEON && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
     ::testing::Values(make_tuple(&aom_fdct32x32_sse2,
-                                 &aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
+                                 &aom_idct32x32_1024_add_sse2, DCT_DCT,
+                                 AOM_BITS_8),
                       make_tuple(&aom_fdct32x32_rd_sse2,
-                                 &aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
+                                 &aom_idct32x32_1024_add_sse2, ADST_DCT,
+                                 AOM_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_sse2, &aom_idct32x32_1024_add_c,
-                                 0, AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_sse2,
-                                 &aom_idct32x32_1024_add_c, 1, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, Trans32x32Test,
+                        ::testing::Values(make_tuple(&aom_fdct32x32_sse2,
+                                                     &aom_idct32x32_1024_add_c,
+                                                     DCT_DCT, AOM_BITS_8),
+                                          make_tuple(&aom_fdct32x32_rd_sse2,
+                                                     &aom_idct32x32_1024_add_c,
+                                                     ADST_DCT, AOM_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
 
 #if HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     AVX2, Trans32x32Test,
     ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
-                                 &aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
+                                 &aom_idct32x32_1024_add_sse2, DCT_DCT,
+                                 AOM_BITS_8),
                       make_tuple(&aom_fdct32x32_rd_avx2,
-                                 &aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
+                                 &aom_idct32x32_1024_add_sse2, ADST_DCT,
+                                 AOM_BITS_8)));
 #endif  // HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_AVX2 && CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     AVX2, Trans32x32Test,
     ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
-                                 &aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
+                                 &aom_idct32x32_1024_add_sse2, DCT_DCT,
+                                 AOM_BITS_8),
                       make_tuple(&aom_fdct32x32_rd_avx2,
-                                 &aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
+                                 &aom_idct32x32_1024_add_sse2, ADST_DCT,
+                                 AOM_BITS_8)));
 #endif  // HAVE_AVX2 && CONFIG_HIGHBITDEPTH
 
 #if HAVE_MSA && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans32x32Test,
     ::testing::Values(make_tuple(&aom_fdct32x32_msa,
-                                 &aom_idct32x32_1024_add_msa, 0, AOM_BITS_8),
+                                 &aom_idct32x32_1024_add_msa, DCT_DCT,
+                                 AOM_BITS_8),
                       make_tuple(&aom_fdct32x32_rd_msa,
-                                 &aom_idct32x32_1024_add_msa, 1, AOM_BITS_8)));
+                                 &aom_idct32x32_1024_add_msa, ADST_DCT,
+                                 AOM_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/decode_api_test.cc b/third_party/aom/test/decode_api_test.cc
index 8b1c9d268..187c8e06a 100644
--- a/third_party/aom/test/decode_api_test.cc
+++ b/third_party/aom/test/decode_api_test.cc
@@ -44,8 +44,11 @@ TEST(DecodeAPI, InvalidParams) {
               aom_codec_dec_init(NULL, kCodecs[i], NULL, 0));
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, kCodecs[i], NULL, 0));
+#if !CONFIG_OBU
+    // Needs to be fixed
     EXPECT_EQ(AOM_CODEC_UNSUP_BITSTREAM,
               aom_codec_decode(&dec, buf, NELEMENTS(buf), NULL, 0));
+#endif
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
               aom_codec_decode(&dec, NULL, NELEMENTS(buf), NULL, 0));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL, 0));
diff --git a/third_party/aom/test/decode_test_driver.cc b/third_party/aom/test/decode_test_driver.cc
index 5f109e092..9a465327e 100644
--- a/third_party/aom/test/decode_test_driver.cc
+++ b/third_party/aom/test/decode_test_driver.cc
@@ -59,14 +59,15 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder,
     /* Vp8's implementation of PeekStream returns an error if the frame you
      * pass it is not a keyframe, so we only expect AOM_CODEC_OK on the first
      * frame, which must be a keyframe. */
-    if (video->frame_number() == 0)
-      ASSERT_EQ(AOM_CODEC_OK, res_peek) << "Peek return failed: "
-                                        << aom_codec_err_to_string(res_peek);
+    if (video->frame_number() == 0) {
+      ASSERT_EQ(AOM_CODEC_OK, res_peek)
+          << "Peek return failed: " << aom_codec_err_to_string(res_peek);
+    }
   } else {
     /* The Av1 implementation of PeekStream returns an error only if the
      * data passed to it isn't a valid Av1 chunk. */
-    ASSERT_EQ(AOM_CODEC_OK, res_peek) << "Peek return failed: "
-                                      << aom_codec_err_to_string(res_peek);
+    ASSERT_EQ(AOM_CODEC_OK, res_peek)
+        << "Peek return failed: " << aom_codec_err_to_string(res_peek);
   }
 }
 
diff --git a/third_party/aom/test/dering_test.cc b/third_party/aom/test/dering_test.cc
index 195a60ff8..6b76561c8 100644
--- a/third_party/aom/test/dering_test.cc
+++ b/third_party/aom/test/dering_test.cc
@@ -17,7 +17,7 @@
 #include "./aom_config.h"
 #include "./av1_rtcd.h"
 #include "aom_ports/aom_timer.h"
-#include "av1/common/od_dering.h"
+#include "av1/common/cdef_block.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -27,8 +27,7 @@ using libaom_test::ACMRandom;
 
 namespace {
 
-typedef std::tr1::tuple<od_filter_dering_direction_func,
-                        od_filter_dering_direction_func, int>
+typedef std::tr1::tuple<cdef_direction_func, cdef_direction_func, int>
     dering_dir_param_t;
 
 class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> {
@@ -44,19 +43,18 @@ class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> {
 
  protected:
   int bsize;
-  od_filter_dering_direction_func dering;
-  od_filter_dering_direction_func ref_dering;
+  cdef_direction_func dering;
+  cdef_direction_func ref_dering;
 };
 
 typedef CDEFDeringDirTest CDEFDeringSpeedTest;
 
-void test_dering(int bsize, int iterations,
-                 od_filter_dering_direction_func dering,
-                 od_filter_dering_direction_func ref_dering) {
+void test_dering(int bsize, int iterations, cdef_direction_func dering,
+                 cdef_direction_func ref_dering) {
   const int size = 8;
-  const int ysize = size + 2 * OD_FILT_VBORDER;
+  const int ysize = size + 2 * CDEF_VBORDER;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, s[ysize * OD_FILT_BSTRIDE]);
+  DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]);
   DECLARE_ALIGNED(16, static uint16_t, d[size * size]);
   DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]);
   memset(ref_d, 0, sizeof(ref_d));
@@ -80,38 +78,36 @@ void test_dering(int bsize, int iterations,
               if (boundary) {
                 if (boundary & 1) {  // Left
                   for (int i = 0; i < ysize; i++)
-                    for (int j = 0; j < OD_FILT_HBORDER; j++)
-                      s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
+                    for (int j = 0; j < CDEF_HBORDER; j++)
+                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
                 }
                 if (boundary & 2) {  // Right
                   for (int i = 0; i < ysize; i++)
-                    for (int j = OD_FILT_HBORDER + size; j < OD_FILT_BSTRIDE;
-                         j++)
-                      s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
+                    for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
+                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
                 }
                 if (boundary & 4) {  // Above
-                  for (int i = 0; i < OD_FILT_VBORDER; i++)
-                    for (int j = 0; j < OD_FILT_BSTRIDE; j++)
-                      s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
+                  for (int i = 0; i < CDEF_VBORDER; i++)
+                    for (int j = 0; j < CDEF_BSTRIDE; j++)
+                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
                 }
                 if (boundary & 8) {  // Below
-                  for (int i = OD_FILT_VBORDER + size; i < ysize; i++)
-                    for (int j = 0; j < OD_FILT_BSTRIDE; j++)
-                      s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE;
+                  for (int i = CDEF_VBORDER + size; i < ysize; i++)
+                    for (int j = 0; j < CDEF_BSTRIDE; j++)
+                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
                 }
               }
               for (dir = 0; dir < 8; dir++) {
                 for (threshold = 0; threshold < 64 << (depth - 8) && !error;
                      threshold += (1 + 4 * !!boundary) << (depth - 8)) {
-                  ref_dering(ref_d, size, s + OD_FILT_HBORDER +
-                                              OD_FILT_VBORDER * OD_FILT_BSTRIDE,
+                  ref_dering(ref_d, size,
+                             s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
                              threshold, dir, damping);
                   // If dering and ref_dering are the same, we're just testing
                   // speed
                   if (dering != ref_dering)
                     ASM_REGISTER_STATE_CHECK(dering(
-                        d, size,
-                        s + OD_FILT_HBORDER + OD_FILT_VBORDER * OD_FILT_BSTRIDE,
+                        d, size, s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
                         threshold, dir, damping));
                   if (ref_dering != dering) {
                     for (pos = 0; pos < sizeof(d) / sizeof(*d) && !error;
@@ -146,9 +142,8 @@ void test_dering(int bsize, int iterations,
                       << std::endl;
 }
 
-void test_dering_speed(int bsize, int iterations,
-                       od_filter_dering_direction_func dering,
-                       od_filter_dering_direction_func ref_dering) {
+void test_dering_speed(int bsize, int iterations, cdef_direction_func dering,
+                       cdef_direction_func ref_dering) {
   aom_usec_timer ref_timer;
   aom_usec_timer timer;
 
@@ -173,7 +168,7 @@ void test_dering_speed(int bsize, int iterations,
       << "SIMD time: " << elapsed_time << " us" << std::endl;
 }
 
-typedef int (*find_dir_t)(const od_dering_in *img, int stride, int32_t *var,
+typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
                           int coeff_shift);
 
 typedef std::tr1::tuple<find_dir_t, find_dir_t> find_dir_param_t;
@@ -196,9 +191,9 @@ class CDEFDeringFindDirTest
 
 typedef CDEFDeringFindDirTest CDEFDeringFindDirSpeedTest;
 
-void test_finddir(int (*finddir)(const od_dering_in *img, int stride,
-                                 int32_t *var, int coeff_shift),
-                  int (*ref_finddir)(const od_dering_in *img, int stride,
+void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
+                                 int coeff_shift),
+                  int (*ref_finddir)(const uint16_t *img, int stride,
                                      int32_t *var, int coeff_shift)) {
   const int size = 8;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -238,9 +233,9 @@ void test_finddir(int (*finddir)(const od_dering_in *img, int stride,
                       << std::endl;
 }
 
-void test_finddir_speed(int (*finddir)(const od_dering_in *img, int stride,
+void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
                                        int32_t *var, int coeff_shift),
-                        int (*ref_finddir)(const od_dering_in *img, int stride,
+                        int (*ref_finddir)(const uint16_t *img, int stride,
                                            int32_t *var, int coeff_shift)) {
   aom_usec_timer ref_timer;
   aom_usec_timer timer;
@@ -289,99 +284,99 @@ using std::tr1::make_tuple;
 // hard to support, so optimizations for this target are disabled.
 #if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, CDEFDeringDirTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_sse2,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_sse2,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringDirTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse2,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_sse2,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_sse2,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                     &cdef_find_dir_c)));
 #endif
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, CDEFDeringDirTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_ssse3,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_ssse3,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringDirTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_ssse3,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_ssse3,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_ssse3,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                     &cdef_find_dir_c)));
 #endif
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, CDEFDeringDirTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_sse4_1,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_sse4_1,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringDirTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse4_1,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_sse4_1,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_sse4_1,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                     &cdef_find_dir_c)));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, CDEFDeringDirTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_neon,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_neon,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringDirTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_neon,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_neon,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_neon,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                     &cdef_find_dir_c)));
 #endif
 
 // Test speed for all supported architectures
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, CDEFDeringSpeedTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_sse2,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_sse2,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse2,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_sse2,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_sse2,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                     &cdef_find_dir_c)));
 #endif
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, CDEFDeringSpeedTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_ssse3,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_ssse3,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_ssse3,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_ssse3,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_ssse3,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                     &cdef_find_dir_c)));
 #endif
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, CDEFDeringSpeedTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_sse4_1,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_sse4_1,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse4_1,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_sse4_1,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_sse4_1,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                     &cdef_find_dir_c)));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, CDEFDeringSpeedTest,
-    ::testing::Values(make_tuple(&od_filter_dering_direction_4x4_neon,
-                                 &od_filter_dering_direction_4x4_c, 4),
-                      make_tuple(&od_filter_dering_direction_8x8_neon,
-                                 &od_filter_dering_direction_8x8_c, 8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_direction_4x4_neon,
+                                                     &cdef_direction_4x4_c, 4),
+                                          make_tuple(&cdef_direction_8x8_neon,
+                                                     &cdef_direction_8x8_c,
+                                                     8)));
 INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&od_dir_find8_neon,
-                                                     &od_dir_find8_c)));
+                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                     &cdef_find_dir_c)));
 #endif
 
 #endif  // defined(_WIN64) || !defined(_MSC_VER)
diff --git a/third_party/aom/test/encode_test_driver.cc b/third_party/aom/test/encode_test_driver.cc
index ec168e969..6941f0148 100644
--- a/third_party/aom/test/encode_test_driver.cc
+++ b/third_party/aom/test/encode_test_driver.cc
@@ -149,11 +149,6 @@ static bool compare_img(const aom_image_t *img1, const aom_image_t *img2,
                         int *const mismatch_row, int *const mismatch_col,
                         int *const mismatch_plane, int *const mismatch_pix1,
                         int *const mismatch_pix2) {
-  const unsigned int w_y = img1->d_w;
-  const unsigned int h_y = img1->d_h;
-  const unsigned int w_uv = ROUND_POWER_OF_TWO(w_y, img1->x_chroma_shift);
-  const unsigned int h_uv = ROUND_POWER_OF_TWO(h_y, img1->y_chroma_shift);
-
   if (img1->fmt != img2->fmt || img1->cs != img2->cs ||
       img1->d_w != img2->d_w || img1->d_h != img2->d_h) {
     if (mismatch_row != NULL) *mismatch_row = -1;
@@ -161,28 +156,15 @@ static bool compare_img(const aom_image_t *img1, const aom_image_t *img2,
     return false;
   }
 
-  if (!compare_plane(img1->planes[AOM_PLANE_Y], img1->stride[AOM_PLANE_Y],
-                     img2->planes[AOM_PLANE_Y], img2->stride[AOM_PLANE_Y], w_y,
-                     h_y, mismatch_row, mismatch_col, mismatch_pix1,
-                     mismatch_pix2)) {
-    if (mismatch_plane != NULL) *mismatch_plane = AOM_PLANE_Y;
-    return false;
-  }
-
-  if (!compare_plane(img1->planes[AOM_PLANE_U], img1->stride[AOM_PLANE_U],
-                     img2->planes[AOM_PLANE_U], img2->stride[AOM_PLANE_U], w_uv,
-                     h_uv, mismatch_row, mismatch_col, mismatch_pix1,
-                     mismatch_pix2)) {
-    if (mismatch_plane != NULL) *mismatch_plane = AOM_PLANE_U;
-    return false;
-  }
-
-  if (!compare_plane(img1->planes[AOM_PLANE_V], img1->stride[AOM_PLANE_V],
-                     img2->planes[AOM_PLANE_V], img2->stride[AOM_PLANE_V], w_uv,
-                     h_uv, mismatch_row, mismatch_col, mismatch_pix1,
-                     mismatch_pix2)) {
-    if (mismatch_plane != NULL) *mismatch_plane = AOM_PLANE_U;
-    return false;
+  for (int plane = 0; plane < 3; plane++) {
+    if (!compare_plane(img1->planes[plane], img1->stride[plane],
+                       img2->planes[plane], img2->stride[plane],
+                       aom_img_plane_width(img1, plane),
+                       aom_img_plane_height(img1, plane), mismatch_row,
+                       mismatch_col, mismatch_pix1, mismatch_pix2)) {
+      if (mismatch_plane != NULL) *mismatch_plane = plane;
+      return false;
+    }
   }
 
   return true;
diff --git a/third_party/aom/test/end_to_end_test.cc b/third_party/aom/test/end_to_end_test.cc
index 93bc1625e..e1a833ec4 100644
--- a/third_party/aom/test/end_to_end_test.cc
+++ b/third_party/aom/test/end_to_end_test.cc
@@ -128,13 +128,11 @@ class EndToEndTest
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
-#if CONFIG_PALETTE
       // Test screen coding tools at cpu_used = 1 && encoding mode is two-pass.
       if (cpu_used_ == 1 && encoding_mode_ == ::libaom_test::kTwoPassGood)
         encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
       else
         encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
-#endif  // CONFIG_PALETTE
       if (encoding_mode_ != ::libaom_test::kRealTime) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
diff --git a/third_party/aom/test/fdct4x4_test.cc b/third_party/aom/test/fdct4x4_test.cc
index e1bd61254..5fad1667b 100644
--- a/third_party/aom/test/fdct4x4_test.cc
+++ b/third_party/aom/test/fdct4x4_test.cc
@@ -36,9 +36,10 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 using libaom_test::FhtFunc;
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, aom_bit_depth_t, int>
+typedef std::tr1::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
     Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int>
+    Ht4x4Param;
 
 void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
                  TxfmParam * /*txfm_param*/) {
@@ -211,119 +212,139 @@ using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT,
                         ::testing::Values(make_tuple(&aom_fdct4x4_c,
-                                                     &aom_idct4x4_16_add_c, 0,
-                                                     AOM_BITS_8, 16)));
+                                                     &aom_idct4x4_16_add_c,
+                                                     DCT_DCT, AOM_BITS_8, 16)));
 
 #if CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     DISABLED_C, Trans4x4HT,
-    ::testing::Values(make_tuple(&fht4x4_12, &iht4x4_12, 0, AOM_BITS_12, 16),
-                      make_tuple(&fht4x4_12, &iht4x4_12, 1, AOM_BITS_12, 16),
-                      make_tuple(&fht4x4_12, &iht4x4_12, 2, AOM_BITS_12, 16),
-                      make_tuple(&fht4x4_12, &iht4x4_12, 3, AOM_BITS_12, 16)));
+    ::testing::Values(
+        make_tuple(&fht4x4_12, &iht4x4_12, DCT_DCT, AOM_BITS_12, 16),
+        make_tuple(&fht4x4_12, &iht4x4_12, ADST_DCT, AOM_BITS_12, 16),
+        make_tuple(&fht4x4_12, &iht4x4_12, DCT_ADST, AOM_BITS_12, 16),
+        make_tuple(&fht4x4_12, &iht4x4_12, ADST_ADST, AOM_BITS_12, 16)));
 
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&fht4x4_10, &iht4x4_10, 0, AOM_BITS_10, 16),
-        make_tuple(&fht4x4_10, &iht4x4_10, 1, AOM_BITS_10, 16),
-        make_tuple(&fht4x4_10, &iht4x4_10, 2, AOM_BITS_10, 16),
-        make_tuple(&fht4x4_10, &iht4x4_10, 3, AOM_BITS_10, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 0, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 1, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 2, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 3, AOM_BITS_8, 16)));
+        make_tuple(&fht4x4_10, &iht4x4_10, DCT_DCT, AOM_BITS_10, 16),
+        make_tuple(&fht4x4_10, &iht4x4_10, ADST_DCT, AOM_BITS_10, 16),
+        make_tuple(&fht4x4_10, &iht4x4_10, DCT_ADST, AOM_BITS_10, 16),
+        make_tuple(&fht4x4_10, &iht4x4_10, ADST_ADST, AOM_BITS_10, 16),
+        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_DCT, AOM_BITS_8,
+                   16),
+        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_DCT, AOM_BITS_8,
+                   16),
+        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_ADST, AOM_BITS_8,
+                   16),
+        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_ADST, AOM_BITS_8,
+                   16)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 0, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 1, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 2, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 3, AOM_BITS_8, 16)));
+    ::testing::Values(make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_DCT,
+                                 AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_DCT,
+                                 AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_ADST,
+                                 AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_ADST,
+                                 AOM_BITS_8, 16)));
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, 0, AOM_BITS_10, 16),
-        make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, 0, AOM_BITS_12, 16),
-        make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, 0, AOM_BITS_8, 16)));
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
+                                 AOM_BITS_10, 16),
+                      make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
+                                 AOM_BITS_12, 16),
+                      make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
+                                 AOM_BITS_8, 16)));
 #else
 INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
                         ::testing::Values(make_tuple(&av1_fwht4x4_c,
-                                                     &aom_iwht4x4_16_add_c, 0,
-                                                     AOM_BITS_8, 16)));
+                                                     &aom_iwht4x4_16_add_c,
+                                                     DCT_DCT, AOM_BITS_8, 16)));
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
                         ::testing::Values(make_tuple(&aom_fdct4x4_c,
                                                      &aom_idct4x4_16_add_neon,
-                                                     0, AOM_BITS_8, 16)));
+                                                     DCT_DCT, AOM_BITS_8, 16)));
 #endif  // HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_NEON && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 0, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 1, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 2, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 3, AOM_BITS_8, 16)));
+    ::testing::Values(make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
+                                 DCT_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
+                                 ADST_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
+                                 DCT_ADST, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
+                                 ADST_ADST, AOM_BITS_8, 16)));
 #endif  // HAVE_NEON && !CONFIG_HIGHBITDEPTH
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_DAALA_DCT4
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
-    ::testing::Values(make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, 0,
+    ::testing::Values(make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
                                  AOM_BITS_8, 16),
-                      make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_sse2, 0,
-                                 AOM_BITS_8, 16)));
+                      make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_sse2,
+                                 DCT_DCT, AOM_BITS_8, 16)));
 #endif
 
 #if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT,
                         ::testing::Values(make_tuple(&aom_fdct4x4_sse2,
                                                      &aom_idct4x4_16_add_sse2,
-                                                     0, AOM_BITS_8, 16)));
+                                                     DCT_DCT, AOM_BITS_8, 16)));
+#if !CONFIG_DAALA_DCT4
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
-    ::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 0,
-                                 AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 1,
-                                 AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 2,
-                                 AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 3,
-                                 AOM_BITS_8, 16)));
+    ::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
+                                 DCT_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
+                                 ADST_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
+                                 DCT_ADST, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
+                                 ADST_ADST, AOM_BITS_8, 16)));
+#endif  // !CONFIG_DAALA_DCT4
 #endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
 
-#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
+#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 0, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 1, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 2, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 3, AOM_BITS_8, 16)));
-#endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
+    ::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
+                                 DCT_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
+                                 ADST_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
+                                 DCT_ADST, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
+                                 ADST_ADST, AOM_BITS_8, 16)));
+#endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
 
 #if HAVE_MSA && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT,
                         ::testing::Values(make_tuple(&aom_fdct4x4_msa,
-                                                     &aom_idct4x4_16_add_msa, 0,
-                                                     AOM_BITS_8, 16)));
-#if !CONFIG_EXT_TX
+                                                     &aom_idct4x4_16_add_msa,
+                                                     DCT_DCT, AOM_BITS_8, 16)));
+#if !CONFIG_EXT_TX && !CONFIG_DAALA_DCT4
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 0, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 1, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 2, AOM_BITS_8, 16),
-        make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 3, AOM_BITS_8,
-                   16)));
-#endif  // !CONFIG_EXT_TX
+    ::testing::Values(make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
+                                 DCT_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
+                                 ADST_DCT, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
+                                 DCT_ADST, AOM_BITS_8, 16),
+                      make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
+                                 ADST_ADST, AOM_BITS_8, 16)));
+#endif  // !CONFIG_EXT_TX && && !CONFIG_DAALA_DCT4
 #endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/fdct8x8_test.cc b/third_party/aom/test/fdct8x8_test.cc
index 62cdf6229..99ae8d677 100644
--- a/third_party/aom/test/fdct8x8_test.cc
+++ b/third_party/aom/test/fdct8x8_test.cc
@@ -44,8 +44,9 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         const TxfmParam *txfm_param);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, aom_bit_depth_t> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t> Ht8x8Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
+    Dct8x8Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t> Ht8x8Param;
 typedef std::tr1::tuple<IdctFunc, IdctFunc, int, aom_bit_depth_t> Idct8x8Param;
 
 void reference_8x8_dct_1d(const double in[8], double out[8]) {
@@ -485,8 +486,8 @@ class FwdTrans8x8TestBase {
         const int diff = dst[j] - ref[j];
 #endif
         const uint32_t error = diff * diff;
-        EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error
-                             << " at index " << j;
+        EXPECT_EQ(0u, error)
+            << "Error: 8x8 IDCT has error " << error << " at index " << j;
       }
     }
   }
@@ -614,108 +615,124 @@ using std::tr1::make_tuple;
 #if CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_c,
-                                                     &aom_idct8x8_64_add_c, 0,
-                                                     AOM_BITS_8)));
+                                                     &aom_idct8x8_64_add_c,
+                                                     DCT_DCT, AOM_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_c,
-                                                     &aom_idct8x8_64_add_c, 0,
-                                                     AOM_BITS_8)));
+                                                     &aom_idct8x8_64_add_c,
+                                                     DCT_DCT, AOM_BITS_8)));
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 0, AOM_BITS_8),
-        make_tuple(&fht8x8_10, &iht8x8_10, 0, AOM_BITS_10),
-        make_tuple(&fht8x8_10, &iht8x8_10, 1, AOM_BITS_10),
-        make_tuple(&fht8x8_10, &iht8x8_10, 2, AOM_BITS_10),
-        make_tuple(&fht8x8_10, &iht8x8_10, 3, AOM_BITS_10),
-        make_tuple(&fht8x8_12, &iht8x8_12, 0, AOM_BITS_12),
-        make_tuple(&fht8x8_12, &iht8x8_12, 1, AOM_BITS_12),
-        make_tuple(&fht8x8_12, &iht8x8_12, 2, AOM_BITS_12),
-        make_tuple(&fht8x8_12, &iht8x8_12, 3, AOM_BITS_12),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 1, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 2, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 3, AOM_BITS_8)));
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_DCT, AOM_BITS_8),
+        make_tuple(&fht8x8_10, &iht8x8_10, DCT_DCT, AOM_BITS_10),
+        make_tuple(&fht8x8_10, &iht8x8_10, ADST_DCT, AOM_BITS_10),
+        make_tuple(&fht8x8_10, &iht8x8_10, DCT_ADST, AOM_BITS_10),
+        make_tuple(&fht8x8_10, &iht8x8_10, ADST_ADST, AOM_BITS_10),
+        make_tuple(&fht8x8_12, &iht8x8_12, DCT_DCT, AOM_BITS_12),
+        make_tuple(&fht8x8_12, &iht8x8_12, ADST_DCT, AOM_BITS_12),
+        make_tuple(&fht8x8_12, &iht8x8_12, DCT_ADST, AOM_BITS_12),
+        make_tuple(&fht8x8_12, &iht8x8_12, ADST_ADST, AOM_BITS_12),
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_DCT, AOM_BITS_8),
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_ADST, AOM_BITS_8),
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_ADST,
+                   AOM_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 0, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 1, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 2, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, 3, AOM_BITS_8)));
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_DCT, AOM_BITS_8),
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_DCT, AOM_BITS_8),
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_ADST, AOM_BITS_8),
+        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_ADST,
+                   AOM_BITS_8)));
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_neon,
                                                      &aom_idct8x8_64_add_neon,
-                                                     0, AOM_BITS_8)));
+                                                     DCT_DCT, AOM_BITS_8)));
 #endif  // HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_NEON && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon, 0, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon, 1, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon, 2, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon, 3, AOM_BITS_8)));
+    ::testing::Values(make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
+                                 DCT_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
+                                 ADST_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
+                                 DCT_ADST, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
+                                 ADST_ADST, AOM_BITS_8)));
 #endif  // HAVE_NEON && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_sse2,
                                                      &aom_idct8x8_64_add_sse2,
-                                                     0, AOM_BITS_8)));
+                                                     DCT_DCT, AOM_BITS_8)));
+#if !CONFIG_DAALA_DCT8
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 0, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 1, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 2, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 3, AOM_BITS_8)));
+    ::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
+                                 DCT_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
+                                 ADST_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
+                                 DCT_ADST, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
+                                 ADST_ADST, AOM_BITS_8)));
+#endif  // !CONFIG_DAALA_DCT8
 #endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
 
 #if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_sse2,
-                                                     &aom_idct8x8_64_add_c, 0,
-                                                     AOM_BITS_8)));
-
+                                                     &aom_idct8x8_64_add_c,
+                                                     DCT_DCT, AOM_BITS_8)));
+#if !CONFIG_DAALA_DCT8
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c, 0, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c, 1, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c, 2, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c, 3, AOM_BITS_8)));
-
+    ::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
+                                 DCT_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
+                                 ADST_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
+                                 DCT_ADST, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
+                                 ADST_ADST, AOM_BITS_8)));
+#endif  // !CONFIG_DAALA_DCT8
 #endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
 
 #if HAVE_SSSE3 && ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
                                                      &aom_idct8x8_64_add_ssse3,
-                                                     0, AOM_BITS_8)));
+                                                     DCT_DCT, AOM_BITS_8)));
 #endif
 
 #if HAVE_MSA && !CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(MSA, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_msa,
-                                                     &aom_idct8x8_64_add_msa, 0,
-                                                     AOM_BITS_8)));
-#if !CONFIG_EXT_TX
+                                                     &aom_idct8x8_64_add_msa,
+                                                     DCT_DCT, AOM_BITS_8)));
+#if !CONFIG_EXT_TX && !CONFIG_DAALA_DCT8
 INSTANTIATE_TEST_CASE_P(
     MSA, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa, 0, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa, 1, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa, 2, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa, 3, AOM_BITS_8)));
-#endif  // !CONFIG_EXT_TX
+    ::testing::Values(make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
+                                 DCT_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
+                                 ADST_DCT, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
+                                 DCT_ADST, AOM_BITS_8),
+                      make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
+                                 ADST_ADST, AOM_BITS_8)));
+#endif  // !CONFIG_EXT_TX && !CONFIG_DAALA_DCT8
 #endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/hiprec_convolve_test.cc b/third_party/aom/test/hiprec_convolve_test.cc
index 0b34c99c9..78e109c9d 100644
--- a/third_party/aom/test/hiprec_convolve_test.cc
+++ b/third_party/aom/test/hiprec_convolve_test.cc
@@ -22,11 +22,13 @@ using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
 
 namespace {
 
+#if HAVE_SSE2
 TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
 
 INSTANTIATE_TEST_CASE_P(SSE2, AV1HiprecConvolveTest,
                         libaom_test::AV1HiprecConvolve::BuildParams(
                             aom_convolve8_add_src_hip_sse2));
+#endif
 
 #if CONFIG_HIGHBITDEPTH && HAVE_SSSE3
 TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
diff --git a/third_party/aom/test/hiprec_convolve_test_util.cc b/third_party/aom/test/hiprec_convolve_test_util.cc
index f5661ec07..4dee6ab4d 100644
--- a/third_party/aom/test/hiprec_convolve_test_util.cc
+++ b/third_party/aom/test/hiprec_convolve_test_util.cc
@@ -100,9 +100,9 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
               vkernel, 16, out_w, out_h);
 
     for (j = 0; j < out_w * out_h; ++j)
-      ASSERT_EQ(output[j], output2[j]) << "Pixel mismatch at index " << j
-                                       << " = (" << (j % out_w) << ", "
-                                       << (j / out_w) << ") on iteration " << i;
+      ASSERT_EQ(output[j], output2[j])
+          << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+          << (j / out_w) << ") on iteration " << i;
   }
   delete[] input_;
   delete[] output;
@@ -175,9 +175,9 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
               hkernel, 16, vkernel, 16, out_w, out_h, bd);
 
     for (j = 0; j < out_w * out_h; ++j)
-      ASSERT_EQ(output[j], output2[j]) << "Pixel mismatch at index " << j
-                                       << " = (" << (j % out_w) << ", "
-                                       << (j / out_w) << ") on iteration " << i;
+      ASSERT_EQ(output[j], output2[j])
+          << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+          << (j / out_w) << ") on iteration " << i;
   }
   delete[] input;
   delete[] output;
diff --git a/third_party/aom/test/intrapred_test.cc b/third_party/aom/test/intrapred_test.cc
index 5dd8c00be..12da1601c 100644
--- a/third_party/aom/test/intrapred_test.cc
+++ b/third_party/aom/test/intrapred_test.cc
@@ -29,27 +29,35 @@ using libaom_test::ACMRandom;
 
 const int count_test_block = 100000;
 
-typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride,
-                          const uint16_t *above, const uint16_t *left, int bps);
+typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
+                                const uint16_t *above, const uint16_t *left,
+                                int bps);
+typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left);
 
+template <typename FuncType>
 struct IntraPredFunc {
-  IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL,
-                int block_size_value = 0, int bit_depth_value = 0)
-      : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
-        bit_depth(bit_depth_value) {}
-
-  IntraPred pred_fn;
-  IntraPred ref_fn;
-  int block_size;
+  IntraPredFunc(FuncType pred = NULL, FuncType ref = NULL,
+                int block_width_value = 0, int block_height_value = 0,
+                int bit_depth_value = 0)
+      : pred_fn(pred), ref_fn(ref), block_width(block_width_value),
+        block_height(block_height_value), bit_depth(bit_depth_value) {}
+
+  FuncType pred_fn;
+  FuncType ref_fn;
+  int block_width;
+  int block_height;
   int bit_depth;
 };
 
-class AV1IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
+template <typename FuncType, typename Pixel>
+class AV1IntraPredTest
+    : public ::testing::TestWithParam<IntraPredFunc<FuncType> > {
  public:
-  void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst,
-               uint16_t *ref_dst) {
+  void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int block_size = params_.block_size;
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
     above_row_ = above_data + 16;
     left_col_ = left_col;
     dst_ = dst;
@@ -57,14 +65,14 @@ class AV1IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
     int error_count = 0;
     for (int i = 0; i < count_test_block; ++i) {
       // Fill edges with random data, try first with saturated values.
-      for (int x = -1; x <= block_size * 2; x++) {
+      for (int x = -1; x <= block_width * 2; x++) {
         if (i == 0) {
           above_row_[x] = mask_;
         } else {
           above_row_[x] = rnd.Rand16() & mask_;
         }
       }
-      for (int y = 0; y < block_size; y++) {
+      for (int y = 0; y < block_height; y++) {
         if (i == 0) {
           left_col_[y] = mask_;
         } else {
@@ -79,43 +87,59 @@ class AV1IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
 
  protected:
   virtual void SetUp() {
-    params_ = GetParam();
-    stride_ = params_.block_size * 3;
+    params_ = this->GetParam();
+    stride_ = params_.block_width * 3;
     mask_ = (1 << params_.bit_depth) - 1;
   }
 
-  void Predict() {
-    const int bit_depth = params_.bit_depth;
-    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
-    ASM_REGISTER_STATE_CHECK(
-        params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
-  }
+  virtual void Predict() = 0;
 
   void CheckPrediction(int test_case_number, int *error_count) const {
     // For each pixel ensure that the calculated value is the same as reference.
-    const int block_size = params_.block_size;
-    for (int y = 0; y < block_size; y++) {
-      for (int x = 0; x < block_size; x++) {
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    for (int y = 0; y < block_height; y++) {
+      for (int x = 0; x < block_width; x++) {
         *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
         if (*error_count == 1) {
           ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
-              << " Failed on Test Case Number " << test_case_number;
+              << " Failed on Test Case Number " << test_case_number
+              << " location: x = " << x << " y = " << y;
         }
       }
     }
   }
 
-  uint16_t *above_row_;
-  uint16_t *left_col_;
-  uint16_t *dst_;
-  uint16_t *ref_dst_;
+  Pixel *above_row_;
+  Pixel *left_col_;
+  Pixel *dst_;
+  Pixel *ref_dst_;
   ptrdiff_t stride_;
   int mask_;
 
-  IntraPredFunc params_;
+  IntraPredFunc<FuncType> params_;
+};
+
+class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
+ protected:
+  void Predict() {
+    const int bit_depth = params_.bit_depth;
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
+  }
+};
+
+class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
+ protected:
+  void Predict() {
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_));
+  }
 };
 
-TEST_P(AV1IntraPredTest, IntraPredTests) {
+TEST_P(HighbdIntraPredTest, Bitexact) {
   // max block size is 32
   DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]);
   DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]);
@@ -124,114 +148,186 @@ TEST_P(AV1IntraPredTest, IntraPredTests) {
   RunTest(left_col, above_data, dst, ref_dst);
 }
 
-#if HAVE_SSE2
+TEST_P(LowbdIntraPredTest, Bitexact) {
+  // max block size is 32
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+
+// -----------------------------------------------------------------------------
+// High Bit Depth Tests
+
+#define highbd_entry(type, width, height, opt, bd)                          \
+  IntraPredFunc<HighbdIntraPred>(                                           \
+      &aom_highbd_##type##_predictor_##width##x##height##_##opt,            \
+      &aom_highbd_##type##_predictor_##width##x##height##_c, width, height, \
+      bd)
+
+#define highbd_intrapred(type, opt, bd)                                       \
+  highbd_entry(type, 4, 4, opt, bd), highbd_entry(type, 4, 8, opt, bd),       \
+      highbd_entry(type, 8, 4, opt, bd), highbd_entry(type, 8, 8, opt, bd),   \
+      highbd_entry(type, 8, 16, opt, bd), highbd_entry(type, 16, 8, opt, bd), \
+      highbd_entry(type, 16, 16, opt, bd),                                    \
+      highbd_entry(type, 16, 32, opt, bd),                                    \
+      highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
+
 #if CONFIG_HIGHBITDEPTH
-const IntraPredFunc IntraPredTestVector8[] = {
-  IntraPredFunc(&aom_highbd_dc_predictor_32x32_sse2,
-                &aom_highbd_dc_predictor_32x32_c, 32, 8),
-#if !CONFIG_ALT_INTRA
-  IntraPredFunc(&aom_highbd_tm_predictor_16x16_sse2,
-                &aom_highbd_tm_predictor_16x16_c, 16, 8),
-  IntraPredFunc(&aom_highbd_tm_predictor_32x32_sse2,
-                &aom_highbd_tm_predictor_32x32_c, 32, 8),
-#endif  // !CONFIG_ALT_INTRA
-
-  IntraPredFunc(&aom_highbd_dc_predictor_4x4_sse2,
-                &aom_highbd_dc_predictor_4x4_c, 4, 8),
-  IntraPredFunc(&aom_highbd_dc_predictor_8x8_sse2,
-                &aom_highbd_dc_predictor_8x8_c, 8, 8),
-  IntraPredFunc(&aom_highbd_dc_predictor_16x16_sse2,
-                &aom_highbd_dc_predictor_16x16_c, 16, 8),
-  IntraPredFunc(&aom_highbd_v_predictor_4x4_sse2, &aom_highbd_v_predictor_4x4_c,
-                4, 8),
-  IntraPredFunc(&aom_highbd_v_predictor_8x8_sse2, &aom_highbd_v_predictor_8x8_c,
-                8, 8),
-  IntraPredFunc(&aom_highbd_v_predictor_16x16_sse2,
-                &aom_highbd_v_predictor_16x16_c, 16, 8),
-  IntraPredFunc(&aom_highbd_v_predictor_32x32_sse2,
-                &aom_highbd_v_predictor_32x32_c, 32, 8)
-#if !CONFIG_ALT_INTRA
-      ,
-  IntraPredFunc(&aom_highbd_tm_predictor_4x4_sse2,
-                &aom_highbd_tm_predictor_4x4_c, 4, 8),
-  IntraPredFunc(&aom_highbd_tm_predictor_8x8_sse2,
-                &aom_highbd_tm_predictor_8x8_c, 8, 8)
-#endif  // !CONFIG_ALT_INTRA
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, AV1IntraPredTest,
+#if HAVE_SSE2
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVector8[] = {
+  highbd_intrapred(dc, sse2, 8),     highbd_intrapred(dc_left, sse2, 8),
+  highbd_intrapred(dc_top, sse2, 8), highbd_intrapred(dc_128, sse2, 8),
+  highbd_intrapred(h, sse2, 8),      highbd_intrapred(v, sse2, 8),
+  highbd_entry(d117, 4, 4, sse2, 8), highbd_entry(d135, 4, 4, sse2, 8),
+  highbd_entry(d153, 4, 4, sse2, 8), highbd_entry(d45e, 4, 4, sse2, 8),
+  highbd_entry(d45e, 4, 8, sse2, 8), highbd_entry(d45e, 8, 4, sse2, 8),
+  highbd_entry(d45e, 8, 8, sse2, 8), highbd_entry(d45e, 8, 16, sse2, 8),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, HighbdIntraPredTest,
                         ::testing::ValuesIn(IntraPredTestVector8));
 
-const IntraPredFunc IntraPredTestVector10[] = {
-  IntraPredFunc(&aom_highbd_dc_predictor_32x32_sse2,
-                &aom_highbd_dc_predictor_32x32_c, 32, 10),
-#if !CONFIG_ALT_INTRA
-  IntraPredFunc(&aom_highbd_tm_predictor_16x16_sse2,
-                &aom_highbd_tm_predictor_16x16_c, 16, 10),
-  IntraPredFunc(&aom_highbd_tm_predictor_32x32_sse2,
-                &aom_highbd_tm_predictor_32x32_c, 32, 10),
-#endif  // !CONFIG_ALT_INTRA
-  IntraPredFunc(&aom_highbd_dc_predictor_4x4_sse2,
-                &aom_highbd_dc_predictor_4x4_c, 4, 10),
-  IntraPredFunc(&aom_highbd_dc_predictor_8x8_sse2,
-                &aom_highbd_dc_predictor_8x8_c, 8, 10),
-  IntraPredFunc(&aom_highbd_dc_predictor_16x16_sse2,
-                &aom_highbd_dc_predictor_16x16_c, 16, 10),
-  IntraPredFunc(&aom_highbd_v_predictor_4x4_sse2, &aom_highbd_v_predictor_4x4_c,
-                4, 10),
-  IntraPredFunc(&aom_highbd_v_predictor_8x8_sse2, &aom_highbd_v_predictor_8x8_c,
-                8, 10),
-  IntraPredFunc(&aom_highbd_v_predictor_16x16_sse2,
-                &aom_highbd_v_predictor_16x16_c, 16, 10),
-  IntraPredFunc(&aom_highbd_v_predictor_32x32_sse2,
-                &aom_highbd_v_predictor_32x32_c, 32, 10)
-#if !CONFIG_ALT_INTRA
-      ,
-  IntraPredFunc(&aom_highbd_tm_predictor_4x4_sse2,
-                &aom_highbd_tm_predictor_4x4_c, 4, 10),
-  IntraPredFunc(&aom_highbd_tm_predictor_8x8_sse2,
-                &aom_highbd_tm_predictor_8x8_c, 8, 10)
-#endif  // !CONFIG_ALT_INTRA
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, AV1IntraPredTest,
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVector10[] = {
+  highbd_intrapred(dc, sse2, 10),     highbd_intrapred(dc_left, sse2, 10),
+  highbd_intrapred(dc_top, sse2, 10), highbd_intrapred(dc_128, sse2, 10),
+  highbd_intrapred(h, sse2, 10),      highbd_intrapred(v, sse2, 10),
+  highbd_entry(d117, 4, 4, sse2, 10), highbd_entry(d135, 4, 4, sse2, 10),
+  highbd_entry(d153, 4, 4, sse2, 10), highbd_entry(d45e, 4, 4, sse2, 10),
+  highbd_entry(d45e, 4, 8, sse2, 10), highbd_entry(d45e, 8, 4, sse2, 10),
+  highbd_entry(d45e, 8, 8, sse2, 10), highbd_entry(d45e, 8, 16, sse2, 10),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, HighbdIntraPredTest,
                         ::testing::ValuesIn(IntraPredTestVector10));
 
-const IntraPredFunc IntraPredTestVector12[] = {
-  IntraPredFunc(&aom_highbd_dc_predictor_32x32_sse2,
-                &aom_highbd_dc_predictor_32x32_c, 32, 12),
-#if !CONFIG_ALT_INTRA
-  IntraPredFunc(&aom_highbd_tm_predictor_16x16_sse2,
-                &aom_highbd_tm_predictor_16x16_c, 16, 12),
-  IntraPredFunc(&aom_highbd_tm_predictor_32x32_sse2,
-                &aom_highbd_tm_predictor_32x32_c, 32, 12),
-#endif  // !CONFIG_ALT_INTRA
-  IntraPredFunc(&aom_highbd_dc_predictor_4x4_sse2,
-                &aom_highbd_dc_predictor_4x4_c, 4, 12),
-  IntraPredFunc(&aom_highbd_dc_predictor_8x8_sse2,
-                &aom_highbd_dc_predictor_8x8_c, 8, 12),
-  IntraPredFunc(&aom_highbd_dc_predictor_16x16_sse2,
-                &aom_highbd_dc_predictor_16x16_c, 16, 12),
-  IntraPredFunc(&aom_highbd_v_predictor_4x4_sse2, &aom_highbd_v_predictor_4x4_c,
-                4, 12),
-  IntraPredFunc(&aom_highbd_v_predictor_8x8_sse2, &aom_highbd_v_predictor_8x8_c,
-                8, 12),
-  IntraPredFunc(&aom_highbd_v_predictor_16x16_sse2,
-                &aom_highbd_v_predictor_16x16_c, 16, 12),
-  IntraPredFunc(&aom_highbd_v_predictor_32x32_sse2,
-                &aom_highbd_v_predictor_32x32_c, 32, 12)
-#if !CONFIG_ALT_INTRA
-      ,
-  IntraPredFunc(&aom_highbd_tm_predictor_4x4_sse2,
-                &aom_highbd_tm_predictor_4x4_c, 4, 12),
-  IntraPredFunc(&aom_highbd_tm_predictor_8x8_sse2,
-                &aom_highbd_tm_predictor_8x8_c, 8, 12)
-#endif  // !CONFIG_ALT_INTRA
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, AV1IntraPredTest,
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVector12[] = {
+  highbd_intrapred(dc, sse2, 12),     highbd_intrapred(dc_left, sse2, 12),
+  highbd_intrapred(dc_top, sse2, 12), highbd_intrapred(dc_128, sse2, 12),
+  highbd_intrapred(h, sse2, 12),      highbd_intrapred(v, sse2, 12),
+  highbd_entry(d117, 4, 4, sse2, 12), highbd_entry(d135, 4, 4, sse2, 12),
+  highbd_entry(d153, 4, 4, sse2, 12), highbd_entry(d45e, 4, 4, sse2, 12),
+  highbd_entry(d45e, 4, 8, sse2, 12), highbd_entry(d45e, 8, 4, sse2, 12),
+  highbd_entry(d45e, 8, 8, sse2, 12), highbd_entry(d45e, 8, 16, sse2, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, HighbdIntraPredTest,
                         ::testing::ValuesIn(IntraPredTestVector12));
 
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_8[] = {
+  highbd_entry(d117, 8, 8, ssse3, 8),   highbd_entry(d117, 16, 16, ssse3, 8),
+  highbd_entry(d117, 32, 32, ssse3, 8), highbd_entry(d135, 8, 8, ssse3, 8),
+  highbd_entry(d135, 16, 16, ssse3, 8), highbd_entry(d135, 32, 32, ssse3, 8),
+  highbd_entry(d153, 8, 8, ssse3, 8),   highbd_entry(d153, 16, 16, ssse3, 8),
+  highbd_entry(d153, 32, 32, ssse3, 8),
+};
+INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_8, HighbdIntraPredTest,
+                        ::testing::ValuesIn(IntraPredTestVectorSsse3_8));
+
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_10[] = {
+  highbd_entry(d117, 8, 8, ssse3, 10),   highbd_entry(d117, 16, 16, ssse3, 10),
+  highbd_entry(d117, 32, 32, ssse3, 10), highbd_entry(d135, 8, 8, ssse3, 10),
+  highbd_entry(d135, 16, 16, ssse3, 10), highbd_entry(d135, 32, 32, ssse3, 10),
+  highbd_entry(d153, 8, 8, ssse3, 10),   highbd_entry(d153, 16, 16, ssse3, 10),
+  highbd_entry(d153, 32, 32, ssse3, 10),
+};
+INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_10, HighbdIntraPredTest,
+                        ::testing::ValuesIn(IntraPredTestVectorSsse3_10));
+
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_12[] = {
+  highbd_entry(d117, 8, 8, ssse3, 12),   highbd_entry(d117, 16, 16, ssse3, 12),
+  highbd_entry(d117, 32, 32, ssse3, 12), highbd_entry(d135, 8, 8, ssse3, 12),
+  highbd_entry(d135, 16, 16, ssse3, 12), highbd_entry(d135, 32, 32, ssse3, 12),
+  highbd_entry(d153, 8, 8, ssse3, 12),   highbd_entry(d153, 16, 16, ssse3, 12),
+  highbd_entry(d153, 32, 32, ssse3, 12),
+};
+INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_12, HighbdIntraPredTest,
+                        ::testing::ValuesIn(IntraPredTestVectorSsse3_12));
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_8[] = {
+  highbd_entry(d45e, 16, 8, avx2, 8),  highbd_entry(d45e, 16, 16, avx2, 8),
+  highbd_entry(d45e, 16, 32, avx2, 8), highbd_entry(d45e, 32, 16, avx2, 8),
+  highbd_entry(d45e, 32, 32, avx2, 8),
+};
+INSTANTIATE_TEST_CASE_P(AVX2_TO_C_8, HighbdIntraPredTest,
+                        ::testing::ValuesIn(IntraPredTestVectorAvx2_8));
+
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_10[] = {
+  highbd_entry(d45e, 16, 8, avx2, 10),  highbd_entry(d45e, 16, 16, avx2, 10),
+  highbd_entry(d45e, 16, 32, avx2, 10), highbd_entry(d45e, 32, 16, avx2, 10),
+  highbd_entry(d45e, 32, 32, avx2, 10),
+};
+INSTANTIATE_TEST_CASE_P(AVX2_TO_C_10, HighbdIntraPredTest,
+                        ::testing::ValuesIn(IntraPredTestVectorAvx2_10));
+
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_12[] = {
+  highbd_entry(d45e, 16, 8, avx2, 12),  highbd_entry(d45e, 16, 16, avx2, 12),
+  highbd_entry(d45e, 16, 32, avx2, 12), highbd_entry(d45e, 32, 16, avx2, 12),
+  highbd_entry(d45e, 32, 32, avx2, 12),
+};
+INSTANTIATE_TEST_CASE_P(AVX2_TO_C_12, HighbdIntraPredTest,
+                        ::testing::ValuesIn(IntraPredTestVectorAvx2_12));
+#endif  // HAVE_AVX2
 #endif  // CONFIG_HIGHBITDEPTH
+
+// -----------------------------------------------------------------------------
+// Low Bit Depth Tests
+
+#define lowbd_entry(type, width, height, opt)                                  \
+  IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
+                           &aom_##type##_predictor_##width##x##height##_c,     \
+                           width, height, 8)
+
+#define lowbd_intrapred(type, opt)                                    \
+  lowbd_entry(type, 4, 4, opt), lowbd_entry(type, 4, 8, opt),         \
+      lowbd_entry(type, 8, 4, opt), lowbd_entry(type, 8, 8, opt),     \
+      lowbd_entry(type, 8, 16, opt), lowbd_entry(type, 16, 8, opt),   \
+      lowbd_entry(type, 16, 16, opt), lowbd_entry(type, 16, 32, opt), \
+      lowbd_entry(type, 32, 16, opt), lowbd_entry(type, 32, 32, opt)
+
+#if HAVE_SSE2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
+  lowbd_intrapred(dc, sse2),      lowbd_intrapred(dc_top, sse2),
+  lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
+  lowbd_intrapred(v, sse2),       lowbd_intrapred(h, sse2),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, LowbdIntraPredTest,
+                        ::testing::ValuesIn(LowbdIntraPredTestVector));
+
 #endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
+  lowbd_entry(dc, 32, 32, avx2),      lowbd_entry(dc_top, 32, 32, avx2),
+  lowbd_entry(dc_left, 32, 32, avx2), lowbd_entry(dc_128, 32, 32, avx2),
+  lowbd_entry(v, 32, 32, avx2),       lowbd_entry(h, 32, 32, avx2),
+  lowbd_entry(dc, 32, 16, avx2),      lowbd_entry(dc_top, 32, 16, avx2),
+  lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2),
+  lowbd_entry(v, 32, 16, avx2),       lowbd_entry(paeth, 16, 8, avx2),
+  lowbd_entry(paeth, 16, 16, avx2),   lowbd_entry(paeth, 16, 32, avx2),
+  lowbd_entry(paeth, 32, 16, avx2),   lowbd_entry(paeth, 32, 32, avx2),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, LowbdIntraPredTest,
+                        ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
+  lowbd_intrapred(paeth, ssse3), lowbd_intrapred(smooth, ssse3),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, LowbdIntraPredTest,
+                        ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
+
+#endif  // HAVE_SSSE3
+
 }  // namespace
diff --git a/third_party/aom/test/ivf_video_source.h b/third_party/aom/test/ivf_video_source.h
index 0d3e9f9cb..956c145ac 100644
--- a/third_party/aom/test/ivf_video_source.h
+++ b/third_party/aom/test/ivf_video_source.h
@@ -48,8 +48,8 @@ class IVFVideoSource : public CompressedVideoSource {
 
   virtual void Begin() {
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-                                     << file_name_;
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
 
     // Read file header
     uint8_t file_hdr[kIvfFileHdrSize];
diff --git a/third_party/aom/test/lpf_8_test.cc b/third_party/aom/test/lpf_8_test.cc
index f050718bb..4859a8ee7 100644
--- a/third_party/aom/test/lpf_8_test.cc
+++ b/third_party/aom/test/lpf_8_test.cc
@@ -35,6 +35,8 @@ const int kNumCoeffs = 1024;
 
 const int number_of_iterations = 10000;
 
+const int kSpeedTestNum = 500000;
+
 #if CONFIG_HIGHBITDEPTH
 typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -242,6 +244,43 @@ TEST_P(Loop8Test6Param, ValueCheck) {
       << "First failed at test case " << first_failure;
 }
 
+TEST_P(Loop8Test6Param, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = kSpeedTestNum;
+#if CONFIG_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+#else
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
+#endif  // CONFIG_HIGHBITDEPTH
+
+  uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+  DECLARE_ALIGNED(16, const uint8_t,
+                  blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+  DECLARE_ALIGNED(16, const uint8_t,
+                  limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                 tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  tmp = rnd.Rand8();
+  DECLARE_ALIGNED(16, const uint8_t,
+                  thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+
+  int32_t p = kNumCoeffs / 32;
+  for (int j = 0; j < kNumCoeffs; ++j) {
+    s[j] = rnd.Rand16() & mask_;
+  }
+
+  for (int i = 0; i < count_test_block; ++i) {
+#if CONFIG_HIGHBITDEPTH
+    loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd);
+#else
+    loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh);
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+}
+
 TEST_P(Loop8Test9Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = number_of_iterations;
@@ -408,9 +447,59 @@ TEST_P(Loop8Test9Param, ValueCheck) {
       << "First failed at test case " << first_failure;
 }
 
+TEST_P(Loop8Test9Param, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = kSpeedTestNum;
+#if CONFIG_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+#else
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
+#endif  // CONFIG_HIGHBITDEPTH
+
+  uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+  DECLARE_ALIGNED(16, const uint8_t,
+                  blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+  DECLARE_ALIGNED(16, const uint8_t,
+                  limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  tmp = rnd.Rand8();
+  DECLARE_ALIGNED(16, const uint8_t,
+                  thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+  DECLARE_ALIGNED(16, const uint8_t,
+                  blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+  DECLARE_ALIGNED(16, const uint8_t,
+                  limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  tmp = rnd.Rand8();
+  DECLARE_ALIGNED(16, const uint8_t,
+                  thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
+  int32_t p = kNumCoeffs / 32;  // TODO(pdlf) can we have non-square here?
+  for (int j = 0; j < kNumCoeffs; ++j) {
+    s[j] = rnd.Rand16() & mask_;
+  }
+
+  for (int i = 0; i < count_test_block; ++i) {
+#if CONFIG_HIGHBITDEPTH
+    const int32_t bd = bit_depth_;
+    loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, limit1,
+                   thresh1, bd);
+#else
+    loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, limit1,
+                   thresh1);
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+}
+
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING)
+#if HAVE_SSE2
 #if CONFIG_HIGHBITDEPTH
 
 const loop8_param_t kHbdLoop8Test6[] = {
@@ -470,12 +559,38 @@ const loop8_param_t kLoop8Test6[] = {
   make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
   make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
   make_tuple(&aom_lpf_vertical_16_sse2, &aom_lpf_vertical_16_c, 8),
+#if !CONFIG_PARALLEL_DEBLOCKING
   make_tuple(&aom_lpf_vertical_16_dual_sse2, &aom_lpf_vertical_16_dual_c, 8)
+#endif
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
                         ::testing::ValuesIn(kLoop8Test6));
 #endif  // CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+#if CONFIG_HIGHBITDEPTH
+
+const loop8_param_t kHbdLoop8Test6Avx2[] = {
+  make_tuple(&aom_highbd_lpf_horizontal_edge_16_avx2,
+             &aom_highbd_lpf_horizontal_edge_16_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_edge_16_avx2,
+             &aom_highbd_lpf_horizontal_edge_16_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_edge_16_avx2,
+             &aom_highbd_lpf_horizontal_edge_16_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_16_dual_avx2,
+             &aom_highbd_lpf_vertical_16_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_16_dual_avx2,
+             &aom_highbd_lpf_vertical_16_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_16_dual_avx2,
+             &aom_highbd_lpf_vertical_16_dual_c, 12)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test6Param,
+                        ::testing::ValuesIn(kHbdLoop8Test6Avx2));
+
+#endif
 #endif
 
 #if HAVE_AVX2 && (!CONFIG_HIGHBITDEPTH) && (!CONFIG_PARALLEL_DEBLOCKING)
@@ -487,7 +602,7 @@ INSTANTIATE_TEST_CASE_P(
                                  &aom_lpf_horizontal_edge_16_c, 8)));
 #endif
 
-#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING)
+#if HAVE_SSE2
 #if CONFIG_HIGHBITDEPTH
 const dualloop8_param_t kHbdLoop8Test9[] = {
   make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
@@ -519,6 +634,7 @@ const dualloop8_param_t kHbdLoop8Test9[] = {
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
                         ::testing::ValuesIn(kHbdLoop8Test9));
 #else
+#if !CONFIG_PARALLEL_DEBLOCKING
 const dualloop8_param_t kLoop8Test9[] = {
   make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
   make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
@@ -528,7 +644,42 @@ const dualloop8_param_t kLoop8Test9[] = {
 
 INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
                         ::testing::ValuesIn(kLoop8Test9));
+#endif
 #endif  // CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+#if CONFIG_HIGHBITDEPTH
+const dualloop8_param_t kHbdLoop8Test9Avx2[] = {
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+             &aom_highbd_lpf_vertical_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+             &aom_highbd_lpf_vertical_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+             &aom_highbd_lpf_vertical_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+             &aom_highbd_lpf_vertical_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+             &aom_highbd_lpf_vertical_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+             &aom_highbd_lpf_vertical_8_dual_c, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test9Param,
+                        ::testing::ValuesIn(kHbdLoop8Test9Avx2));
+#endif
 #endif
 
 #if HAVE_NEON && (!CONFIG_PARALLEL_DEBLOCKING)
diff --git a/third_party/aom/test/minmax_test.cc b/third_party/aom/test/minmax_test.cc
index f82529192..aaac72c65 100644
--- a/third_party/aom/test/minmax_test.cc
+++ b/third_party/aom/test/minmax_test.cc
@@ -108,10 +108,10 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
       int min_ref, max_ref, min, max;
       reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
       ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
-      EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
-                              << " and b_stride = " << b_stride;
-      EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
-                              << " and b_stride = " << b_stride;
+      EXPECT_EQ(max_ref, max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(min_ref, min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
     }
   }
 }
diff --git a/third_party/aom/test/quantize_func_test.cc b/third_party/aom/test/quantize_func_test.cc
index 94dd056b4..2e4829021 100644
--- a/third_party/aom/test/quantize_func_test.cc
+++ b/third_party/aom/test/quantize_func_test.cc
@@ -157,8 +157,8 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
             << " Q: " << q << " coeff: " << coeff_ptr[j];
       }
 
-      ASSERT_EQ(eob[0], eob[1]) << "eobs mismatch on test: " << i
-                                << " Q: " << q;
+      ASSERT_EQ(eob[0], eob[1])
+          << "eobs mismatch on test: " << i << " Q: " << q;
     }
   }
 
diff --git a/third_party/aom/test/register_state_check.h b/third_party/aom/test/register_state_check.h
index 330820173..cce662a6d 100644
--- a/third_party/aom/test/register_state_check.h
+++ b/third_party/aom/test/register_state_check.h
@@ -49,7 +49,7 @@ namespace libaom_test {
 class RegisterStateCheck {
  public:
   RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
-  ~RegisterStateCheck() { EXPECT_TRUE(Check()); }
+  ~RegisterStateCheck() { Check(); }
 
  private:
   static bool StoreRegisters(CONTEXT *const context) {
@@ -62,10 +62,10 @@ class RegisterStateCheck {
   }
 
   // Compares the register state. Returns true if the states match.
-  bool Check() const {
-    if (!initialized_) return false;
+  void Check() const {
+    ASSERT_TRUE(initialized_);
     CONTEXT post_context;
-    if (!StoreRegisters(&post_context)) return false;
+    ASSERT_TRUE(StoreRegisters(&post_context));
 
     const M128A *xmm_pre = &pre_context_.Xmm6;
     const M128A *xmm_post = &post_context.Xmm6;
@@ -74,7 +74,6 @@ class RegisterStateCheck {
       ++xmm_pre;
       ++xmm_post;
     }
-    return !testing::Test::HasNonfatalFailure();
   }
 
   bool initialized_;
@@ -105,7 +104,7 @@ namespace libaom_test {
 class RegisterStateCheck {
  public:
   RegisterStateCheck() { initialized_ = StoreRegisters(pre_store_); }
-  ~RegisterStateCheck() { EXPECT_TRUE(Check()); }
+  ~RegisterStateCheck() { Check(); }
 
  private:
   static bool StoreRegisters(int64_t store[8]) {
@@ -114,15 +113,14 @@ class RegisterStateCheck {
   }
 
   // Compares the register state. Returns true if the states match.
-  bool Check() const {
-    if (!initialized_) return false;
+  void Check() const {
+    ASSERT_TRUE(initialized_);
     int64_t post_store[8];
     aom_push_neon(post_store);
     for (int i = 0; i < 8; ++i) {
-      EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8
-                                              << " has been modified";
+      EXPECT_EQ(pre_store_[i], post_store[i])
+          << "d" << i + 8 << " has been modified";
     }
-    return !testing::Test::HasNonfatalFailure();
   }
 
   bool initialized_;
@@ -159,12 +157,12 @@ class RegisterStateCheckMMX {
   RegisterStateCheckMMX() {
     __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_));
   }
-  ~RegisterStateCheckMMX() { EXPECT_TRUE(Check()); }
+  ~RegisterStateCheckMMX() { Check(); }
 
  private:
   // Checks the FPU tag word pre/post execution, returning false if not cleared
   // to 0xffff.
-  bool Check() const {
+  void Check() const {
     EXPECT_EQ(0xffff, pre_fpu_env_[4])
         << "FPU was in an inconsistent state prior to call";
 
@@ -172,7 +170,6 @@ class RegisterStateCheckMMX {
     __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env));
     EXPECT_EQ(0xffff, post_fpu_env[4])
         << "FPU was left in an inconsistent state after call";
-    return !testing::Test::HasNonfatalFailure();
   }
 
   uint16_t pre_fpu_env_[14];
diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc
index 802713d32..c4e924de0 100644
--- a/third_party/aom/test/resize_test.cc
+++ b/third_party/aom/test/resize_test.cc
@@ -298,10 +298,10 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
     unsigned int expected_h;
     ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
                         &expected_h, 0);
-    EXPECT_EQ(expected_w, info->w) << "Frame " << frame
-                                   << " had unexpected width";
-    EXPECT_EQ(expected_h, info->h) << "Frame " << frame
-                                   << " had unexpected height";
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
   }
 }
 
@@ -351,11 +351,11 @@ class ResizeInternalTest : public ResizeTest {
         encoder->Config(&cfg_);
       }
     } else {
-      if (video->frame() == kStepDownFrame) {
+      if (video->frame() >= kStepDownFrame && video->frame() < kStepUpFrame) {
         struct aom_scaling_mode mode = { AOME_FOURFIVE, AOME_THREEFIVE };
         encoder->Control(AOME_SET_SCALEMODE, &mode);
       }
-      if (video->frame() == kStepUpFrame) {
+      if (video->frame() >= kStepUpFrame) {
         struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
         encoder->Control(AOME_SET_SCALEMODE, &mode);
       }
@@ -364,7 +364,7 @@ class ResizeInternalTest : public ResizeTest {
 
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
-    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.5);
   }
 
 #if WRITE_COMPRESSED_STREAM
@@ -406,6 +406,9 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
 
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
+  }
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
     const aom_codec_pts_t pts = info->pts;
     if (pts >= kStepDownFrame && pts < kStepUpFrame) {
       ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
@@ -509,10 +512,10 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
     unsigned int expected_h;
     ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
                         &expected_h, 1);
-    EXPECT_EQ(expected_w, info->w) << "Frame " << frame
-                                   << " had unexpected width";
-    EXPECT_EQ(expected_h, info->h) << "Frame " << frame
-                                   << " had unexpected height";
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
     EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
   }
 }
@@ -520,7 +523,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Run at low bitrate, with resize_allowed = 1, and verify that we get
 // one resize down event.
-TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
+TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDown) {
   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 299);
   DefaultConfig();
@@ -558,7 +561,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Start at low target bitrate, raise the bitrate in the middle of the clip,
 // scaling-up should occur after bitrate changed.
-TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) {
   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 359);
   DefaultConfig();
@@ -693,7 +696,11 @@ class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource {
   }
 };
 
+#if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH)
+TEST_P(ResizeCspTest, DISABLED_TestResizeCspWorks) {
+#else
 TEST_P(ResizeCspTest, TestResizeCspWorks) {
+#endif
   ResizingCspVideoSource video;
   init_flags_ = AOM_CODEC_USE_PSNR;
   cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
@@ -704,7 +711,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) {
 AV1_INSTANTIATE_TEST_CASE(ResizeTest,
                           ::testing::Values(::libaom_test::kRealTime));
 AV1_INSTANTIATE_TEST_CASE(ResizeInternalTest,
-                          ::testing::Values(::libaom_test::kOnePassBest));
+                          ::testing::Values(::libaom_test::kOnePassGood));
 AV1_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
                           ::testing::Values(::libaom_test::kRealTime),
                           ::testing::Range(5, 9));
diff --git a/third_party/aom/test/scan_test.cc b/third_party/aom/test/scan_test.cc
index 16c831c8e..2b11bd1fb 100644
--- a/third_party/aom/test/scan_test.cc
+++ b/third_party/aom/test/scan_test.cc
@@ -43,6 +43,7 @@ TEST(ScanTest, av1_augment_prob) {
   }
 }
 
+#if USE_TOPOLOGICAL_SORT
 TEST(ScanTest, av1_update_sort_order) {
   const TX_SIZE tx_size = TX_4X4;
   const TX_TYPE tx_type = DCT_DCT;
@@ -54,7 +55,9 @@ TEST(ScanTest, av1_update_sort_order) {
   av1_update_sort_order(tx_size, tx_type, prob, sort_order);
   for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_sort_order[i], sort_order[i]);
 }
+#endif
 
+#if USE_TOPOLOGICAL_SORT
 TEST(ScanTest, av1_update_scan_order) {
   TX_SIZE tx_size = TX_4X4;
   const TX_TYPE tx_type = DCT_DCT;
@@ -74,6 +77,7 @@ TEST(ScanTest, av1_update_scan_order) {
     EXPECT_EQ(i, scan[ref_iscan[i]]);
   }
 }
+#endif
 
 TEST(ScanTest, av1_update_neighbors) {
   TX_SIZE tx_size = TX_4X4;
@@ -94,4 +98,33 @@ TEST(ScanTest, av1_update_neighbors) {
   }
 }
 
+#if USE_2X2_PROB
+TEST(ScanTest, av1_down_sample_scan_count) {
+  const uint32_t non_zero_count[256] = {
+    13, 12, 11, 10, 0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 13, 9, 10, 8, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 11, 12, 9, 8, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  13, 9, 9, 10, 0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
+    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0,
+  };
+  const uint32_t ref_non_zero_count_ds[64] = {
+    13, 11, 0, 0, 0, 0, 0, 0, 11, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,  0,  0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,  0,  0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+  uint32_t non_zero_count_ds[64];
+  av1_down_sample_scan_count(non_zero_count_ds, non_zero_count, TX_16X16);
+  for (int i = 0; i < 64; ++i) {
+    EXPECT_EQ(ref_non_zero_count_ds[i], non_zero_count_ds[i]);
+  }
+}
+#endif
+
 }  // namespace
diff --git a/third_party/aom/test/selfguided_filter_test.cc b/third_party/aom/test/selfguided_filter_test.cc
index 736e3f4c0..55ce1d5de 100644
--- a/third_party/aom/test/selfguided_filter_test.cc
+++ b/third_party/aom/test/selfguided_filter_test.cc
@@ -40,18 +40,25 @@ class AV1SelfguidedFilterTest
 
  protected:
   void RunSpeedTest() {
-    const int w = 256, h = 256;
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+    const int width = 256, height = 256, stride = 288, out_stride = 288;
     const int NUM_ITERS = 2000;
-    int i, j;
+    int i, j, k;
 
-    uint8_t *input = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
-    uint8_t *output = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
+    uint8_t *input_ =
+        (uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
+    uint8_t *output_ = (uint8_t *)aom_memalign(
+        16, out_stride * (height + 32) * sizeof(uint8_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+    uint8_t *input = input_ + stride * 16 + 16;
+    uint8_t *output = output_ + out_stride * 16 + 16;
 
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & 0xFF;
+    for (i = -16; i < height + 16; ++i)
+      for (j = -16; j < width + 16; ++j)
+        input[i * stride + j] = rnd.Rand16() & 0xFF;
 
     int xqd[2] = {
       SGRPROJ_PRJ_MIN0 +
@@ -67,20 +74,30 @@ class AV1SelfguidedFilterTest
 
     std::clock_t start = std::clock();
     for (i = 0; i < NUM_ITERS; ++i) {
-      apply_selfguided_restoration(input, w, h, w, eps, xqd, output, w, tmpbuf);
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint8_t *input_p = input + k * stride + j;
+          uint8_t *output_p = output + k * out_stride + j;
+          apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
+                                       output_p, out_stride, tmpbuf);
+        }
     }
     std::clock_t end = std::clock();
     double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
-           elapsed, elapsed * 1000000. / NUM_ITERS);
+    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
+           height, elapsed, elapsed * 1000000. / NUM_ITERS);
 
-    aom_free(input);
-    aom_free(output);
+    aom_free(input_);
+    aom_free(output_);
     aom_free(tmpbuf);
   }
 
   void RunCorrectnessTest() {
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
     // Set the maximum width/height to test here. We actually test a small
     // range of sizes *up to* this size, so that we can check, eg.,
     // the behaviour on tiles which are not a multiple of 4 wide.
@@ -88,21 +105,26 @@ class AV1SelfguidedFilterTest
     const int NUM_ITERS = 81;
     int i, j, k;
 
-    uint8_t *input =
-        (uint8_t *)aom_memalign(16, stride * max_h * sizeof(uint8_t));
-    uint8_t *output =
-        (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
-    uint8_t *output2 =
-        (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
+    uint8_t *input_ =
+        (uint8_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint8_t));
+    uint8_t *output_ = (uint8_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint8_t));
+    uint8_t *output2_ = (uint8_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint8_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
 
+    uint8_t *input = input_ + stride * 16 + 16;
+    uint8_t *output = output_ + out_stride * 16 + 16;
+    uint8_t *output2 = output2_ + out_stride * 16 + 16;
+
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
     av1_loop_restoration_precal();
 
     for (i = 0; i < NUM_ITERS; ++i) {
-      for (j = 0; j < max_h; ++j)
-        for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & 0xFF;
+      for (j = -16; j < max_h + 16; ++j)
+        for (k = -16; k < max_w + 16; ++k)
+          input[j * stride + k] = rnd.Rand16() & 0xFF;
 
       int xqd[2] = {
         SGRPROJ_PRJ_MIN0 +
@@ -116,18 +138,33 @@ class AV1SelfguidedFilterTest
       int test_w = max_w - (i / 9);
       int test_h = max_h - (i % 9);
 
+      for (k = 0; k < test_h; k += pu_height)
+        for (j = 0; j < test_w; j += pu_width) {
+          int w = AOMMIN(pu_width, test_w - j);
+          int h = AOMMIN(pu_height, test_h - k);
+          uint8_t *input_p = input + k * stride + j;
+          uint8_t *output_p = output + k * out_stride + j;
+          uint8_t *output2_p = output2 + k * out_stride + j;
+          apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
+                                       output_p, out_stride, tmpbuf);
+          apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+                                         output2_p, out_stride, tmpbuf);
+        }
+      /*
       apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
                                    output, out_stride, tmpbuf);
       apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
                                      output2, out_stride, tmpbuf);
+                                     */
       for (j = 0; j < test_h; ++j)
-        for (k = 0; k < test_w; ++k)
+        for (k = 0; k < test_w; ++k) {
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
+        }
     }
 
-    aom_free(input);
-    aom_free(output);
-    aom_free(output2);
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output2_);
     aom_free(tmpbuf);
   }
 };
@@ -135,9 +172,8 @@ class AV1SelfguidedFilterTest
 TEST_P(AV1SelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
 
-const FilterTestParam params[] = { make_tuple() };
-
 #if HAVE_SSE4_1
+const FilterTestParam params[] = { make_tuple() };
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
                         ::testing::ValuesIn(params));
 #endif
@@ -156,20 +192,27 @@ class AV1HighbdSelfguidedFilterTest
 
  protected:
   void RunSpeedTest() {
-    const int w = 256, h = 256;
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+    const int width = 256, height = 256, stride = 288, out_stride = 288;
     const int NUM_ITERS = 2000;
-    int i, j;
+    int i, j, k;
     int bit_depth = GET_PARAM(0);
     int mask = (1 << bit_depth) - 1;
 
-    uint16_t *input = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
-    uint16_t *output = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
+    uint16_t *input_ =
+        (uint16_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint16_t));
+    uint16_t *output_ = (uint16_t *)aom_memalign(
+        16, out_stride * (height + 32) * sizeof(uint16_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+    uint16_t *input = input_ + stride * 16 + 16;
+    uint16_t *output = output_ + out_stride * 16 + 16;
 
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & mask;
+    for (i = -16; i < height + 16; ++i)
+      for (j = -16; j < width + 16; ++j)
+        input[i * stride + j] = rnd.Rand16() & mask;
 
     int xqd[2] = {
       SGRPROJ_PRJ_MIN0 +
@@ -185,21 +228,31 @@ class AV1HighbdSelfguidedFilterTest
 
     std::clock_t start = std::clock();
     for (i = 0; i < NUM_ITERS; ++i) {
-      apply_selfguided_restoration_highbd(input, w, h, w, bit_depth, eps, xqd,
-                                          output, w, tmpbuf);
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint16_t *input_p = input + k * stride + j;
+          uint16_t *output_p = output + k * out_stride + j;
+          apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
+                                              eps, xqd, output_p, out_stride,
+                                              tmpbuf);
+        }
     }
     std::clock_t end = std::clock();
     double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
-           elapsed, elapsed * 1000000. / NUM_ITERS);
+    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
+           height, elapsed, elapsed * 1000000. / NUM_ITERS);
 
-    aom_free(input);
-    aom_free(output);
+    aom_free(input_);
+    aom_free(output_);
     aom_free(tmpbuf);
   }
 
   void RunCorrectnessTest() {
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
     // Set the maximum width/height to test here. We actually test a small
     // range of sizes *up to* this size, so that we can check, eg.,
     // the behaviour on tiles which are not a multiple of 4 wide.
@@ -209,21 +262,26 @@ class AV1HighbdSelfguidedFilterTest
     int bit_depth = GET_PARAM(0);
     int mask = (1 << bit_depth) - 1;
 
-    uint16_t *input =
-        (uint16_t *)aom_memalign(16, stride * max_h * sizeof(uint16_t));
-    uint16_t *output =
-        (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
-    uint16_t *output2 =
-        (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
+    uint16_t *input_ =
+        (uint16_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint16_t));
+    uint16_t *output_ = (uint16_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint16_t));
+    uint16_t *output2_ = (uint16_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint16_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
 
+    uint16_t *input = input_ + stride * 16 + 16;
+    uint16_t *output = output_ + out_stride * 16 + 16;
+    uint16_t *output2 = output2_ + out_stride * 16 + 16;
+
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
     av1_loop_restoration_precal();
 
     for (i = 0; i < NUM_ITERS; ++i) {
-      for (j = 0; j < max_h; ++j)
-        for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & mask;
+      for (j = -16; j < max_h + 16; ++j)
+        for (k = -16; k < max_w + 16; ++k)
+          input[j * stride + k] = rnd.Rand16() & mask;
 
       int xqd[2] = {
         SGRPROJ_PRJ_MIN0 +
@@ -237,20 +295,37 @@ class AV1HighbdSelfguidedFilterTest
       int test_w = max_w - (i / 9);
       int test_h = max_h - (i % 9);
 
+      for (k = 0; k < test_h; k += pu_height)
+        for (j = 0; j < test_w; j += pu_width) {
+          int w = AOMMIN(pu_width, test_w - j);
+          int h = AOMMIN(pu_height, test_h - k);
+          uint16_t *input_p = input + k * stride + j;
+          uint16_t *output_p = output + k * out_stride + j;
+          uint16_t *output2_p = output2 + k * out_stride + j;
+          apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
+                                              eps, xqd, output_p, out_stride,
+                                              tmpbuf);
+          apply_selfguided_restoration_highbd_c(input_p, w, h, stride,
+                                                bit_depth, eps, xqd, output2_p,
+                                                out_stride, tmpbuf);
+        }
+
+      /*
       apply_selfguided_restoration_highbd(input, test_w, test_h, stride,
                                           bit_depth, eps, xqd, output,
                                           out_stride, tmpbuf);
       apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride,
                                             bit_depth, eps, xqd, output2,
                                             out_stride, tmpbuf);
+                                            */
       for (j = 0; j < test_h; ++j)
         for (k = 0; k < test_w; ++k)
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
     }
 
-    aom_free(input);
-    aom_free(output);
-    aom_free(output2);
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output2_);
     aom_free(tmpbuf);
   }
 };
@@ -258,10 +333,9 @@ class AV1HighbdSelfguidedFilterTest
 TEST_P(AV1HighbdSelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
 
+#if HAVE_SSE4_1
 const HighbdFilterTestParam highbd_params[] = { make_tuple(8), make_tuple(10),
                                                 make_tuple(12) };
-
-#if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdSelfguidedFilterTest,
                         ::testing::ValuesIn(highbd_params));
 #endif
diff --git a/third_party/aom/test/simd_cmp_impl.h b/third_party/aom/test/simd_cmp_impl.h
index 46f46d751..03fe703d9 100644
--- a/third_party/aom/test/simd_cmp_impl.h
+++ b/third_party/aom/test/simd_cmp_impl.h
@@ -371,10 +371,10 @@ typedef struct {
   fptr simd;
 } mapping;
 
-#define MAP(name)                                                      \
-  {                                                                    \
-    #name, reinterpret_cast < fptr > (c_##name),                       \
-                                      reinterpret_cast < fptr > (name) \
+#define MAP(name)                                                              \
+  {                                                                            \
+    #name,                                                                     \
+        reinterpret_cast < fptr > (c_##name), reinterpret_cast < fptr > (name) \
   }
 
 const mapping m[] = { MAP(v64_sad_u8),
diff --git a/third_party/aom/test/subtract_test.cc b/third_party/aom/test/subtract_test.cc
index ad39f56b3..725a6a2c6 100644
--- a/third_party/aom/test/subtract_test.cc
+++ b/third_party/aom/test/subtract_test.cc
@@ -130,7 +130,11 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
 
+#if CONFIG_EXT_PARTITION
     const size_t max_width = 128;
+#else
+    const size_t max_width = 64;
+#endif
     const size_t max_block_size = max_width * max_width;
     src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
         aom_memalign(16, max_block_size * sizeof(uint16_t))));
@@ -147,8 +151,8 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
   }
 
  protected:
-  void RunForSpeed();
   void CheckResult();
+  void RunForSpeed();
 
  private:
   ACMRandom rnd_;
@@ -161,27 +165,13 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
   int16_t *diff_;
 };
 
-void AV1HBDSubtractBlockTest::RunForSpeed() {
-  const int test_num = 200000;
-  const int max_width = 128;
-  const int max_block_size = max_width * max_width;
-  const int mask = (1 << bit_depth_) - 1;
-  int i, j;
-
-  for (j = 0; j < max_block_size; ++j) {
-    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
-    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
-  }
-
-  for (i = 0; i < test_num; ++i) {
-    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
-          pred_, block_width_, bit_depth_);
-  }
-}
-
 void AV1HBDSubtractBlockTest::CheckResult() {
   const int test_num = 100;
-  const int max_width = 128;
+#if CONFIG_EXT_PARTITION
+  const size_t max_width = 128;
+#else
+  const size_t max_width = 64;
+#endif
   const int max_block_size = max_width * max_width;
   const int mask = (1 << bit_depth_) - 1;
   int i, j;
@@ -208,9 +198,29 @@ void AV1HBDSubtractBlockTest::CheckResult() {
 
 TEST_P(AV1HBDSubtractBlockTest, CheckResult) { CheckResult(); }
 
-#if USE_SPEED_TEST
-TEST_P(AV1HBDSubtractBlockTest, CheckSpeed) { RunForSpeed(); }
-#endif  // USE_SPEED_TEST
+void AV1HBDSubtractBlockTest::RunForSpeed() {
+  const int test_num = 200000;
+#if CONFIG_EXT_PARTITION
+  const size_t max_width = 128;
+#else
+  const size_t max_width = 64;
+#endif
+  const int max_block_size = max_width * max_width;
+  const int mask = (1 << bit_depth_) - 1;
+  int i, j;
+
+  for (j = 0; j < max_block_size; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  for (i = 0; i < test_num; ++i) {
+    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+          pred_, block_width_, bit_depth_);
+  }
+}
+
+TEST_P(AV1HBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
 
 #if HAVE_SSE2
 
@@ -241,12 +251,14 @@ const Params kAV1HBDSubtractBlock_sse2[] = {
   make_tuple(64, 32, 12, &aom_highbd_subtract_block_c),
   make_tuple(64, 64, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(64, 64, 12, &aom_highbd_subtract_block_c),
+#if CONFIG_EXT_PARTITION
   make_tuple(64, 128, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(64, 128, 12, &aom_highbd_subtract_block_c),
   make_tuple(128, 64, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(128, 64, 12, &aom_highbd_subtract_block_c),
   make_tuple(128, 128, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(128, 128, 12, &aom_highbd_subtract_block_c)
+#endif  // CONFIG_EXT_PARTITION
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, AV1HBDSubtractBlockTest,
diff --git a/third_party/aom/test/test-data.mk b/third_party/aom/test/test-data.mk
index 083b34953..d82033e3b 100644
--- a/third_party/aom/test/test-data.mk
+++ b/third_party/aom/test/test-data.mk
@@ -40,6 +40,10 @@ LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += tacomasmallcameramovement_640_480_30.y
 LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += thaloundeskmtg_640_480_30.yuv
 endif  # CONFIG_ENCODE_PERF_TESTS
 
+ifeq ($(CONFIG_EXT_TILE),yes)
+LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += vase10x10.yuv
+endif  # CONFIG_EXT_TILE
+
 # sort and remove duplicates
 LIBAOM_TEST_DATA-yes := $(sort $(LIBAOM_TEST_DATA-yes))
 
diff --git a/third_party/aom/test/test-data.sha1 b/third_party/aom/test/test-data.sha1
index 3d9bfc7c4..0caf21e1e 100644
--- a/third_party/aom/test/test-data.sha1
+++ b/third_party/aom/test/test-data.sha1
@@ -26,3 +26,4 @@ e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv
 717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m
 9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
 5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
+36ddab9b99eb7545aa0bf362d6f498212d596516 *vase10x10.yuv
diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake
index a02f9203f..26937c96a 100644
--- a/third_party/aom/test/test.cmake
+++ b/third_party/aom/test/test.cmake
@@ -103,6 +103,7 @@ set(AOM_UNIT_TEST_ENCODER_SOURCES
     "${AOM_ROOT}/test/encode_test_driver.h"
     "${AOM_ROOT}/test/error_resilience_test.cc"
     "${AOM_ROOT}/test/i420_video_source.h"
+    "${AOM_ROOT}/test/resize_test.cc"
     "${AOM_ROOT}/test/y4m_test.cc"
     "${AOM_ROOT}/test/y4m_video_source.h"
     "${AOM_ROOT}/test/yuv_video_source.h")
@@ -133,24 +134,35 @@ if (NOT BUILD_SHARED_LIBS)
         "${AOM_ROOT}/test/av1_txfm_test.h"
         "${AOM_ROOT}/test/intrapred_test.cc"
         "${AOM_ROOT}/test/lpf_8_test.cc"
-        "${AOM_ROOT}/test/motion_vector_test.cc"
         "${AOM_ROOT}/test/simd_cmp_impl.h")
 
-    if (CONFIG_CDEF)
-      set(AOM_UNIT_TEST_COMMON_SOURCES
-          ${AOM_UNIT_TEST_COMMON_SOURCES}
-          "${AOM_ROOT}/test/clpf_test.cc"
-          "${AOM_ROOT}/test/dering_test.cc")
-    endif ()
+    set(AOM_UNIT_TEST_ENCODER_SOURCES
+        ${AOM_UNIT_TEST_ENCODER_SOURCES}
+        "${AOM_ROOT}/test/motion_vector_test.cc")
 
-    if (CONFIG_FILTER_INTRA)
-      if (HAVE_SSE4_1)
+    if (CONFIG_CDEF)
+      if (CONFIG_CDEF_SINGLEPASS)
+        set(AOM_UNIT_TEST_COMMON_SOURCES
+            ${AOM_UNIT_TEST_COMMON_SOURCES}
+            "${AOM_ROOT}/test/cdef_test.cc")
+      else ()
         set(AOM_UNIT_TEST_COMMON_SOURCES
             ${AOM_UNIT_TEST_COMMON_SOURCES}
-            "${AOM_ROOT}/test/filterintra_predictors_test.cc")
+            "${AOM_ROOT}/test/clpf_test.cc"
+            "${AOM_ROOT}/test/dering_test.cc")
       endif ()
     endif ()
 
+    # Omit 4-tap filter intra predictor test-- currently a 3-tap filter is in
+    # use.
+    #if (CONFIG_FILTER_INTRA)
+    #  if (HAVE_SSE4_1)
+    #    set(AOM_UNIT_TEST_COMMON_SOURCES
+    #        ${AOM_UNIT_TEST_COMMON_SOURCES}
+    #        "${AOM_ROOT}/test/filterintra_predictors_test.cc")
+    #  endif ()
+    #endif ()
+
     if (CONFIG_INTRABC)
         set(AOM_UNIT_TEST_COMMON_SOURCES
             ${AOM_UNIT_TEST_COMMON_SOURCES}
@@ -160,10 +172,15 @@ if (NOT BUILD_SHARED_LIBS)
     if (CONFIG_LOOP_RESTORATION)
       set(AOM_UNIT_TEST_COMMON_SOURCES
           ${AOM_UNIT_TEST_COMMON_SOURCES}
-           "${AOM_ROOT}/test/hiprec_convolve_test.cc"
+          "${AOM_ROOT}/test/selfguided_filter_test.cc")
+
+      if (HAVE_SSE2)
+        set(AOM_UNIT_TEST_COMMON_SOURCES
+            ${AOM_UNIT_TEST_COMMON_SOURCES}
+            "${AOM_ROOT}/test/hiprec_convolve_test.cc"
             "${AOM_ROOT}/test/hiprec_convolve_test_util.cc"
-            "${AOM_ROOT}/test/hiprec_convolve_test_util.h"
-            "${AOM_ROOT}/test/selfguided_filter_test.cc")
+            "${AOM_ROOT}/test/hiprec_convolve_test_util.h")
+      endif ()
     endif ()
 
     set(AOM_UNIT_TEST_COMMON_INTRIN_NEON
@@ -202,11 +219,12 @@ if (CONFIG_AV1_ENCODER)
         "${AOM_ROOT}/test/av1_fht16x16_test.cc"
         "${AOM_ROOT}/test/av1_fht32x32_test.cc"
         "${AOM_ROOT}/test/av1_fht8x8_test.cc"
-        "${AOM_ROOT}/test/av1_inv_txfm_test.cc"
         "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
         "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
         "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
         "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
+        "${AOM_ROOT}/test/av1_inv_txfm_test.cc"
+        "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
         "${AOM_ROOT}/test/avg_test.cc"
         "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
         "${AOM_ROOT}/test/blend_a64_mask_test.cc"
@@ -214,27 +232,37 @@ if (CONFIG_AV1_ENCODER)
         "${AOM_ROOT}/test/fdct4x4_test.cc"
         "${AOM_ROOT}/test/fdct8x8_test.cc"
         "${AOM_ROOT}/test/hadamard_test.cc"
+        "${AOM_ROOT}/test/masked_sad_test.cc"
+        "${AOM_ROOT}/test/masked_variance_test.cc"
         "${AOM_ROOT}/test/minmax_test.cc"
-        "${AOM_ROOT}/test/quantize_func_test.cc"
         "${AOM_ROOT}/test/subtract_test.cc"
         "${AOM_ROOT}/test/sum_squares_test.cc"
         "${AOM_ROOT}/test/variance_test.cc")
 
-    if (CONFIG_CONVOLVE_ROUND)
+    if (NOT CONFIG_AOM_QM AND NOT CONFIG_NEW_QUANT)
       set(AOM_UNIT_TEST_ENCODER_SOURCES
           ${AOM_UNIT_TEST_ENCODER_SOURCES}
-          "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
-          "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
-          "${AOM_ROOT}/test/av1_convolve_2d_test_util.h"
-          "${AOM_ROOT}/test/convolve_round_test.cc")
-      endif ()
+          "${AOM_ROOT}/test/quantize_func_test.cc")
+    endif ()
 
-    if (CONFIG_EXT_INTER)
+    if (CONFIG_CONVOLVE_ROUND)
       set(AOM_UNIT_TEST_ENCODER_SOURCES
           ${AOM_UNIT_TEST_ENCODER_SOURCES}
-          "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
-          "${AOM_ROOT}/test/masked_sad_test.cc"
-          "${AOM_ROOT}/test/masked_variance_test.cc")
+          "${AOM_ROOT}/test/convolve_round_test.cc")
+      if (HAVE_SSE2)
+        set(AOM_UNIT_TEST_ENCODER_SOURCES
+            ${AOM_UNIT_TEST_ENCODER_SOURCES}
+            "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
+            "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
+            "${AOM_ROOT}/test/av1_convolve_2d_test_util.h")
+      endif ()
+      if (NOT CONFIG_COMPOUND_ROUND)
+        if (HAVE_SSE4_1)
+          set(AOM_UNIT_TEST_ENCODER_SOURCES
+              ${AOM_UNIT_TEST_ENCODER_SOURCES}
+              "${AOM_ROOT}/test/av1_convolve_scale_test.cc")
+        endif ()
+      endif ()
     endif ()
 
     if (CONFIG_EXT_TX)
@@ -274,9 +302,9 @@ if (NOT BUILD_SHARED_LIBS)
   if (CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
     set(AOM_UNIT_TEST_COMMON_SOURCES
         ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/binary_codes_test.cc"
         "${AOM_ROOT}/test/divu_small_test.cc"
         "${AOM_ROOT}/test/ethread_test.cc"
+        "${AOM_ROOT}/test/coding_path_sync.cc"
         "${AOM_ROOT}/test/idct8x8_test.cc"
         "${AOM_ROOT}/test/partial_idct_test.cc"
         "${AOM_ROOT}/test/superframe_test.cc"
@@ -290,6 +318,7 @@ if (NOT BUILD_SHARED_LIBS)
     else ()
       set(AOM_UNIT_TEST_COMMON_SOURCES
           ${AOM_UNIT_TEST_COMMON_SOURCES}
+          "${AOM_ROOT}/test/binary_codes_test.cc"
           "${AOM_ROOT}/test/boolcoder_test.cc")
     endif ()
 
@@ -327,22 +356,25 @@ if (CONFIG_UNIT_TESTS)
     # Force static run time to avoid collisions with googletest.
     include("${AOM_ROOT}/build/cmake/msvc_runtime.cmake")
   endif ()
-  include_directories(
-    "${AOM_ROOT}/third_party/googletest/src/googletest/src"
-    "${AOM_ROOT}/third_party/googletest/src/googletest/include")
 
   if (BUILD_SHARED_LIBS AND APPLE)
     # Silence an RPATH warning.
     set(CMAKE_MACOSX_RPATH 1)
   endif ()
-  add_subdirectory("${AOM_ROOT}/third_party/googletest/src/googletest"
-                   EXCLUDE_FROM_ALL)
-
-  # Generate a stub file containing the C function usage_exit(); this is
-  # required because of the test dependency on aom_common_app_util.
-  # Specifically, the function die() in tools_common.c calls usage_exit() to
-  # terminate the program on the caller's behalf.
-  file(WRITE "${AOM_CONFIG_DIR}/usage_exit.c" "void usage_exit(void) {}")
+
+  include_directories(
+    "${AOM_ROOT}/third_party/googletest/src/googletest/src"
+    "${AOM_ROOT}/third_party/googletest/src/googletest/include")
+
+  if (AOM_DISABLE_GTEST_CMAKE)
+    include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
+    add_library(gtest STATIC
+      "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+  else ()
+    add_subdirectory("${AOM_ROOT}/third_party/googletest/src/googletest"
+                     EXCLUDE_FROM_ALL)
+  endif ()
+
 endif ()
 
 # Setup the targets for CONFIG_UNIT_TESTS. The libaom and app util targets must
@@ -364,6 +396,7 @@ function (setup_aom_test_targets)
   add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
                  $<TARGET_OBJECTS:aom_common_app_util>
                  $<TARGET_OBJECTS:test_aom_common>)
+  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} test_libaom)
 
   if (CONFIG_AV1_DECODER)
     target_sources(test_libaom PRIVATE
@@ -390,6 +423,7 @@ function (setup_aom_test_targets)
                      $<TARGET_OBJECTS:aom_common_app_util>)
       target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE}
                             aom gtest)
+      set(AOM_APP_TARGETS ${AOM_APP_TARGETS} test_intra_pred_speed)
     endif ()
   endif ()
 
@@ -483,6 +517,8 @@ function (setup_aom_test_targets)
   endforeach ()
   add_custom_target(runtests)
   add_dependencies(runtests ${test_targets})
+
+  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} PARENT_SCOPE)
 endfunction ()
 
 endif ()  # AOM_TEST_TEST_CMAKE_
diff --git a/third_party/aom/test/test.mk b/third_party/aom/test/test.mk
index 4132e4f74..e6b0c534c 100644
--- a/third_party/aom/test/test.mk
+++ b/third_party/aom/test/test.mk
@@ -33,11 +33,10 @@ LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += altref_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += aq_segment_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += datarate_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += encode_api_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += coding_path_sync.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += error_resilience_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += i420_video_source.h
 #LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += realtime_test.cc
-#LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += resize_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += resize_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += y4m_video_source.h
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += yuv_video_source.h
 
@@ -107,6 +106,7 @@ ifeq ($(CONFIG_AV1),yes)
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_AV1_ENCODER)$(CONFIG_AV1_DECODER),yesyes)
 # IDCT test currently depends on FDCT function
+LIBAOM_TEST_SRCS-yes                   += coding_path_sync.cc
 LIBAOM_TEST_SRCS-yes                   += idct8x8_test.cc
 LIBAOM_TEST_SRCS-yes                   += partial_idct_test.cc
 LIBAOM_TEST_SRCS-yes                   += superframe_test.cc
@@ -135,8 +135,12 @@ endif
 LIBAOM_TEST_SRCS-$(CONFIG_ADAPT_SCAN)  += scan_test.cc
 LIBAOM_TEST_SRCS-yes                   += convolve_test.cc
 LIBAOM_TEST_SRCS-yes                   += lpf_8_test.cc
+ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
+LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += cdef_test.cc
+else
 LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += dering_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += clpf_test.cc
+endif
 LIBAOM_TEST_SRCS-yes                   += simd_cmp_impl.h
 LIBAOM_TEST_SRCS-$(HAVE_SSE2)          += simd_cmp_sse2.cc
 LIBAOM_TEST_SRCS-$(HAVE_SSSE3)         += simd_cmp_ssse3.cc
@@ -163,11 +167,9 @@ LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += error_block_test.cc
 #LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_quantize_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += subtract_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += arf_freq_test.cc
-ifneq ($(CONFIG_AOM_QM), yes)
 ifneq ($(CONFIG_NEW_QUANT), yes)
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += quantize_func_test.cc
 endif
-endif
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += block_error_test.cc
 
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_inv_txfm_test.cc
@@ -193,11 +195,9 @@ LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += subtract_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += blend_a64_mask_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += blend_a64_mask_1d_test.cc
 
-ifeq ($(CONFIG_EXT_INTER),yes)
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += masked_variance_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += masked_sad_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_wedge_utils_test.cc
-endif
 
 ## Skip the unit test written for 4-tap filter intra predictor, because we
 ## revert to 3-tap filter.
@@ -252,6 +252,10 @@ LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.cc
 LIBAOM_TEST_SRCS-yes          += convolve_round_test.cc
 endif
 
+ifeq (yesx,$(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND))
+LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_convolve_scale_test.cc
+endif
+
 ifeq ($(CONFIG_GLOBAL_MOTION)$(CONFIG_AV1_ENCODER),yesyes)
 LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += corner_match_test.cc
 endif
diff --git a/third_party/aom/test/test_data_util.cmake b/third_party/aom/test/test_data_util.cmake
index e4641049d..3904734b5 100644
--- a/third_party/aom/test/test_data_util.cmake
+++ b/third_party/aom/test/test_data_util.cmake
@@ -9,6 +9,47 @@
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
 
+set(AOM_TEST_DATA_FILE_NAMES
+    "hantro_collage_w352h288.yuv"
+    "hantro_odd.yuv"
+    "park_joy_90p_10_420.y4m"
+    "park_joy_90p_10_422.y4m"
+    "park_joy_90p_10_444.y4m"
+    "park_joy_90p_10_440.yuv"
+    "park_joy_90p_12_420.y4m"
+    "park_joy_90p_12_422.y4m"
+    "park_joy_90p_12_444.y4m"
+    "park_joy_90p_12_440.yuv"
+    "park_joy_90p_8_420_a10-1.y4m"
+    "park_joy_90p_8_420.y4m"
+    "park_joy_90p_8_422.y4m"
+    "park_joy_90p_8_444.y4m"
+    "park_joy_90p_8_440.yuv"
+    "desktop_credits.y4m"
+    "niklas_1280_720_30.y4m"
+    "rush_hour_444.y4m"
+    "screendata.y4m"
+    "niklas_640_480_30.yuv")
+
+if (CONFIG_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+  set(AOM_TEST_DATA_FILE_NAMES
+      ${AOM_TEST_DATA_FILE_NAMES}
+      "niklas_1280_720_30.yuv")
+endif ()
+
+if (CONFIG_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+  set(AOM_TEST_DATA_FILE_NAMES
+      ${AOM_TEST_DATA_FILE_NAMES}
+      "desktop_640_360_30.yuv"
+      "kirland_640_480_30.yuv"
+      "macmarcomoving_640_480_30.yuv"
+      "macmarcostationary_640_480_30.yuv"
+      "niklas_1280_720_30.yuv"
+      "tacomanarrows_640_480_30.yuv"
+      "tacomasmallcameramovement_640_480_30.yuv"
+      "thaloundeskmtg_640_480_30.yuv")
+endif ()
+
 # Parses test/test-data.sha1 and writes captured file names and checksums to
 # $out_files and $out_checksums as lists.
 function (make_test_data_lists test_data_file out_files out_checksums)
@@ -28,8 +69,12 @@ function (make_test_data_lists test_data_file out_files out_checksums)
     string(SUBSTRING "${line}" 0 ${delim_pos} checksum)
     string(SUBSTRING "${line}" ${filename_pos} -1 filename)
 
-    set(checksums ${checksums} ${checksum})
-    set(filenames ${filenames} ${filename})
+    list(FIND AOM_TEST_DATA_FILE_NAMES ${filename} list_index)
+    if (NOT ${list_index} EQUAL -1)
+      # Include the name and checksum in output only when the file is needed.
+      set(checksums ${checksums} ${checksum})
+      set(filenames ${filenames} ${filename})
+    endif ()
   endforeach ()
 
   list(LENGTH filenames num_files)
diff --git a/third_party/aom/test/test_intra_pred_speed.cc b/third_party/aom/test/test_intra_pred_speed.cc
index 70d82484c..25289446f 100644
--- a/third_party/aom/test/test_intra_pred_speed.cc
+++ b/third_party/aom/test/test_intra_pred_speed.cc
@@ -31,199 +31,356 @@ namespace {
 typedef void (*AvxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left);
 
+const int kBPS = 32;
+const int kTotalPixels = kBPS * kBPS;
 const int kNumAv1IntraFuncs = INTRA_MODES + 3;  // 4 DC predictor variants.
 const char *kAv1IntraPredNames[kNumAv1IntraFuncs] = {
   "DC_PRED",       "DC_LEFT_PRED",  "DC_TOP_PRED", "DC_128_PRED", "V_PRED",
   "H_PRED",        "D45_PRED",      "D135_PRED",   "D117_PRED",   "D153_PRED",
-  "D207_PRED",     "D63_PRED",      "TM_PRED",
-#if CONFIG_ALT_INTRA
-  "SMOOTH_PRED",
+  "D207_PRED",     "D63_PRED",      "TM_PRED",     "SMOOTH_PRED",
 #if CONFIG_SMOOTH_HV
   "SMOOTH_V_PRED", "SMOOTH_H_PRED",
 #endif  // CONFIG_SMOOTH_HV
-#endif  // CONFIG_ALT_INTRA
 };
 
+template <typename Pixel>
+struct IntraPredTestMem {
+  void Init(int block_width, int bd) {
+    libaom_test::ACMRandom rnd(libaom_test::ACMRandom::DeterministicSeed());
+    Pixel *const above = above_mem + 16;
+    const int mask = (1 << bd) - 1;
+    for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+    for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
+    for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
+
+    ASSERT_LE(block_width, kBPS);
+    for (int i = kBPS; i < 2 * kBPS; ++i) {
+      left[i] = rnd.Rand16() & mask;
+      above[i] = rnd.Rand16() & mask;
+    }
+  }
+
+  DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, left[2 * kBPS]);
+  DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+// -----------------------------------------------------------------------------
+// Low Bittdepth
+
+typedef IntraPredTestMem<uint8_t> Av1IntraPredTestMem;
+
+// Note:
+// APPLY_UNIT_TESTS
+// 1: Do unit tests
+// 0: Generate MD5 array as required
+#define APPLY_UNIT_TESTS 1
+
+void CheckMd5Signature(const char name[], const char *const signatures[],
+                       const void *data, size_t data_size, int elapsed_time,
+                       int idx) {
+  libaom_test::MD5 md5;
+  md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
+#if APPLY_UNIT_TESTS
+  printf("Mode %s[%13s]: %5d ms     MD5: %s\n", name, kAv1IntraPredNames[idx],
+         elapsed_time, md5.Get());
+  EXPECT_STREQ(signatures[idx], md5.Get());
+#else
+  printf("\"%s\",\n", md5.Get());
+#endif
+}
+
 void TestIntraPred(const char name[], AvxPredFunc const *pred_funcs,
-                   const char *const pred_func_names[], int num_funcs,
-                   const char *const signatures[], int /*block_size*/,
-                   int num_pixels_per_test) {
-  libaom_test::ACMRandom rnd(libaom_test::ACMRandom::DeterministicSeed());
-  const int kBPS = 32;
-  const int kTotalPixels = 32 * kBPS;
-  DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
-  DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
-  DECLARE_ALIGNED(16, uint8_t, left[2 * kBPS]);
-  DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
-  uint8_t *const above = above_mem + 16;
-  for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
-  for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
-  for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
+                   const char *const signatures[], int block_width,
+                   int block_height) {
+  const int num_pixels_per_test =
+      block_width * block_height * kNumAv1IntraFuncs;
   const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+  Av1IntraPredTestMem intra_pred_test_mem;
+  const uint8_t *const above = intra_pred_test_mem.above_mem + 16;
 
-  // Fill up bottom-left and top-right pixels.
-  for (int i = kBPS; i < 2 * kBPS; ++i) {
-    left[i] = rnd.Rand8();
-    above[i] = rnd.Rand8();
-  }
+  intra_pred_test_mem.Init(block_width, 8);
 
-  for (int k = 0; k < num_funcs; ++k) {
+  for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
     if (pred_funcs[k] == NULL) continue;
-    memcpy(src, ref_src, sizeof(src));
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
-      pred_funcs[k](src, kBPS, above, left);
+      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+                    intra_pred_test_mem.left);
     }
     libaom_test::ClearSystemState();
     aom_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
-    libaom_test::MD5 md5;
-    md5.Add(src, sizeof(src));
-    printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, pred_func_names[k],
-           elapsed_time, md5.Get());
-    EXPECT_STREQ(signatures[k], md5.Get());
+    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
   }
 }
 
-void TestIntraPred4(AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
-    "4334156168b34ab599d9b5b30f522fe9",
-    "bc4649d5ba47c7ff178d92e475960fb0",
-    "8d316e5933326dcac24e1064794b5d12",
-    "a27270fed024eafd762c95de85f4da51",
-    "c33dff000d4256c2b8f3bf9e9bab14d2",
-    "44d8cddc2ad8f79b8ed3306051722b4f",
-    "df62e96dfcb25d8a435482756a6fa990",
-    "ecb0d56ae5f677ea45127ce9d5c058e4",
-    "0b7936841f6813da818275944895b574",
-    "9117972ef64f91a58ff73e1731c81db2",
-    "46d493dccf6e5356c6f3c0c73b7dd141",
-    "b852f42e6c4991d415400332d567872f",
-#if CONFIG_ALT_INTRA
-    "828c49a4248993cce4876fa26eab697f",
-    "718c8cee9011f92ef31f77a9a7560010",
-#if CONFIG_SMOOTH_HV
-    "b37eeadbbd9e3bdff023a5097b59213a",
-    "d6fb9c659d82c78f0d0c891da6cba87f",
+void TestIntraPred4(const char *block_name, AvxPredFunc const *pred_funcs) {
+  static const char *const kSignatures4x4[kNumAv1IntraFuncs] = {
+    "e7ed7353c3383fff942e500e9bfe82fe",
+    "2a4a26fcc6ce005eadc08354d196c8a9",
+    "269d92eff86f315d9c38fe7640d85b15",
+    "ae2960eea9f71ee3dabe08b282ec1773",
+    "6c1abcc44e90148998b51acd11144e9c",
+    "f7bb3186e1ef8a2b326037ff898cad8e",
+    "87e72798518d62e84bcc77dcb17d0f3b",
+    "141624072a4a56773f68fadbdd07c4a7",
+    "7be49b08687a5f24df3a2c612fca3876",
+    "459bb5d9fd5b238348179c9a22108cd6",
+    "3d98810f418a9de92acfe2c68909c61c",
+    "6310eecda3cc9496987ca10186255558",
+    "59fc0e923a08cfac0a493fb38988e2bb",
+    "9ff8bb37d9c830e6ab8ecb0c435d3c91",
+#if CONFIG_SMOOTH_HV
+    "de6937fca02354f2874dbc5dbec5d5b3",
+    "723cf948137f7d8c7860d814e55ae67d",
 #endif  // CONFIG_SMOOTH_HV
-#else
-    "309a618577b27c648f9c5ee45252bc8f",
-#endif  // CONFIG_ALT_INTRA
   };
-  TestIntraPred("Intra4", pred_funcs, kAv1IntraPredNames, kNumAv1IntraFuncs,
-                kSignatures, 4, 4 * 4 * kNumAv1IntraFuncs);
+  static const char *const kSignatures4x8[kNumAv1IntraFuncs] = {
+    "d9fbebdc85f71ab1e18461b2db4a2adc",
+    "5ccb2a68284bc9714d94b8a06ccadbb2",
+    "735d059abc2744f3ff3f9590f7191b37",
+    "d9fbebdc85f71ab1e18461b2db4a2adc",
+    "6819497c44cd0ace120add83672996ee",
+    "7e3244f5a2d3edf81c7e962a842b97f9",
+    "3fa52ee9acf5a25594cac684be263f32",
+    "c18dd23d57def4df4c6147c572dfc827",
+    "d007fbf7e43cb8f49702daa20f0c9153",
+    "5c0226c44c5df285728296b80cc6de4b",
+    "b55d7b558bebc8c2042dfac58b3c4688",
+    "6549362baa389b8faa2d954926b64e2f",
+    "809350f164cd4d1650850bb0f59c3260",
+    "1b60a394331eeab6927a6f8aaff57040",
+#if CONFIG_SMOOTH_HV
+    "5307de1bd7329ba6b281d2c1b0b457f9",
+    "24c58a8138339846d95568efb91751db",
+#endif
+  };
+  if (!strcmp(block_name, "intra4x4")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures4x4, 4, 4);
+  }
+  if (!strcmp(block_name, "intra4x8")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures4x8, 4, 8);
+  }
 }
 
-void TestIntraPred8(AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
-    "7694ddeeefed887faf9d339d18850928",
-    "7d726b1213591b99f736be6dec65065b",
-    "19c5711281357a485591aaf9c96c0a67",
-    "ba6b66877a089e71cd938e3b8c40caac",
-    "802440c93317e0f8ba93fab02ef74265",
-    "9e09a47a15deb0b9d8372824f9805080",
-    "a2fd4b66e1a667a3e582588934a7e4bd",
-    "78339c1c60bb1d67d248ab8c4da08b7f",
-    "5c97d70f7d47de1882a6cd86c165c8a9",
-    "8182bf60688b42205acd95e59e967157",
-    "9d69fcaf12398e67242d3fcf5cf2267e",
-    "7a09adb0fa6c2bf889a99dd816622feb",
-#if CONFIG_ALT_INTRA
-    "f6ade499c626d38eb70661184b79bc57",
-    "1ad5b106c79b792e514ba25e87139b5e",
-#if CONFIG_SMOOTH_HV
-    "fe0d359b91a1d8141483d2e032f1b75f",
-    "0cfd7603ced02829d1ce18b6795d73d0",
+void TestIntraPred8(const char *block_name, AvxPredFunc const *pred_funcs) {
+  static const char *const kSignatures8x8[kNumAv1IntraFuncs] = {
+    "d8bbae5d6547cfc17e4f5f44c8730e88",
+    "373bab6d931868d41a601d9d88ce9ac3",
+    "6fdd5ff4ff79656c14747598ca9e3706",
+    "d9661c2811d6a73674f40ffb2b841847",
+    "7c722d10b19ccff0b8c171868e747385",
+    "f81dd986eb2b50f750d3a7da716b7e27",
+    "e0b1292448f3350bf1c92ca283ca872a",
+    "0e3523f9cab2142dd37fd07ec0760bce",
+    "79ac4efe907f0a0f1885d43066cfedee",
+    "19ecf2432ac305057de3b6578474eec6",
+    "7ae38292cbe47b4aa0807c3bd5a543df",
+    "d0ecffec1bb01f4b61ab5738164695c4",
+    "064404361748dd111a890a1470d7f0ea",
+    "dc29b7e1f78cc8e7525d5ea4c0ab9b78",
+#if CONFIG_SMOOTH_HV
+    "97111eb1bc26bade6272015df829f1ae",
+    "d19a8a73cc46b807f2c5e817576cc1e1",
 #endif  // CONFIG_SMOOTH_HV
-#else
-    "815b75c8e0d91cc1ae766dc5d3e445a3",
-#endif  // CONFIG_ALT_INTRA
   };
-  TestIntraPred("Intra8", pred_funcs, kAv1IntraPredNames, kNumAv1IntraFuncs,
-                kSignatures, 8, 8 * 8 * kNumAv1IntraFuncs);
+  static const char *const kSignatures8x4[kNumAv1IntraFuncs] = {
+    "23f9fc11344426c9bee2e06d57dfd628",
+    "2d71a26d1bae1fb34734de7b42fc5eb7",
+    "5af9c1b2fd9d5721fad67b67b3f7c816",
+    "00d71b17be662753813d515f197d145e",
+    "bef10ec984427e28f4390f43809d10af",
+    "77773cdfb7ed6bc882ab202a64b0a470",
+    "cba356970f6b9a1b6024e1dbe4a66f9b",
+    "c58c21efc804242848e6f29a93a7984d",
+    "dc92cc45a51c7a397506cab19f74e66d",
+    "391f6a12224f81a3719ea09a2cf7a5ad",
+    "b74b8b11f7eb2bbf723b25f381104ca9",
+    "2234aaa06ca245624211cf53a0261017",
+    "2cc48bd66d6b0121b5221d52ccd732af",
+    "b302155e1c9eeeafe2ba2bf68e807a46",
+#if CONFIG_SMOOTH_HV
+    "561bc8d0e76d5041ebd5168fc6a115e1",
+    "81d0113fb1d0a9a24ffd6f1987b77948",
+#endif
+  };
+  static const char *const kSignatures8x16[kNumAv1IntraFuncs] = {
+    "c849de88b24f773dfcdd1d48d1209796",
+    "6cb807c1897b94866a0f3d3c56ed8695",
+    "d56db05a8ac7981762f5b877f486c4ef",
+    "b4bc01eb6e59a40922ad17715cafb04b",
+    "09d178439534f4062ae687c351f66d64",
+    "644501399cf73080ac606e5cef7ca09b",
+    "0e8e968fa177204d7e73d7e04ce69ebb",
+    "1d25f9287fdf7ba48a5105f1529b7e75",
+    "02cacccf3752451763a6a6e2e784494f",
+    "6044a1416d53e324ddc012d2e7763339",
+    "57ac6e8f3ab5e943c9280043eeb174b8",
+    "d51b9d65471194d9caebc7d67e75ef10",
+    "278076495180e17c065a95ab7278539a",
+    "9dd7f324816f242be408ffeb0c673732",
+#if CONFIG_SMOOTH_HV
+    "f520c4a20acfa0bea1d253c6f0f040fd",
+    "85f38df809df2c2d7c8b4a157a65cd44",
+#endif
+  };
+  if (!strcmp(block_name, "intra8x8")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures8x8, 8, 8);
+  }
+  if (!strcmp(block_name, "intra8x4")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures8x4, 8, 4);
+  }
+  if (!strcmp(block_name, "intra8x16")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures8x16, 8, 16);
+  }
 }
 
-void TestIntraPred16(AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
-    "b40dbb555d5d16a043dc361e6694fe53",
-    "fb08118cee3b6405d64c1fd68be878c6",
-    "6c190f341475c837cc38c2e566b64875",
-    "db5c34ccbe2c7f595d9b08b0dc2c698c",
-    "a62cbfd153a1f0b9fed13e62b8408a7a",
-    "143df5b4c89335e281103f610f5052e4",
-    "404944b521d16f6edd160feeeb31ff35",
-    "7841fae7d4d47b519322e6a03eeed9dc",
-    "f6ebed3f71cbcf8d6d0516ce87e11093",
-    "3cc480297dbfeed01a1c2d78dd03d0c5",
-    "fbd607f15da218c5390a5b183b634a10",
-    "f7063ccbc29f87303d5c3d0555b08944",
-#if CONFIG_ALT_INTRA
-    "7adcaaa3554eb71a81fc48cb9043984b",
-    "c0acea4397c1b4d54a21bbcec5731dff",
-#if CONFIG_SMOOTH_HV
-    "f15b8712f0f064e98a7d804d3074afa7",
-    "01a09cdb8edd06d840c84643032fc02f",
+void TestIntraPred16(const char *block_name, AvxPredFunc const *pred_funcs) {
+  static const char *const kSignatures16x16[kNumAv1IntraFuncs] = {
+    "50971c07ce26977d30298538fffec619",
+    "527a6b9e0dc5b21b98cf276305432bef",
+    "7eff2868f80ebc2c43a4f367281d80f7",
+    "67cd60512b54964ef6aff1bd4816d922",
+    "48371c87dc95c08a33b2048f89cf6468",
+    "b0acf2872ee411d7530af6d2625a7084",
+    "31d901ab2289d1e61e704e40240382a7",
+    "dae208f3dca583529cff49b73f7c4183",
+    "7af66a2f4c8e0b4908e40f047e60c47c",
+    "125e3ab6ab9bc961f183ec366a7afa88",
+    "ff230677e800977757d14b85a9eba404",
+    "eb42dc39140515dd4f3ab1afe6c3e71b",
+    "93d6b5352b571805ab16a55e1bbed86a",
+    "03764e4c0aebbc180e4e2c68fb06df2b",
+#if CONFIG_SMOOTH_HV
+    "bb6c74c9076c9f266ab11fb57060d8e6",
+    "0c5162bc28489756ddb847b5678e6f07",
 #endif  // CONFIG_SMOOTH_HV
-#else
-    "b8a41aa968ec108af447af4217cba91b",
-#endif  // CONFIG_ALT_INTRA
   };
-  TestIntraPred("Intra16", pred_funcs, kAv1IntraPredNames, kNumAv1IntraFuncs,
-                kSignatures, 16, 16 * 16 * kNumAv1IntraFuncs);
+  static const char *const kSignatures16x8[kNumAv1IntraFuncs] = {
+    "b4cbdbdf10ce13300b4063a3daf99e04",
+    "3731e1e6202064a9d0604d7c293ecee4",
+    "6c856188c4256a06452f0d5d70cac436",
+    "1f2192b4c8c497589484ea7bf9c944e8",
+    "84011bd4b7f565119d06787840e333a0",
+    "0e48949f7a6aa36f0d76b5d01f91124a",
+    "58114c06f6b9d8285e5020c7afd834ab",
+    "e37afe84a8b3c5e0f048d4652ecbe09e",
+    "c216348473fb029b45f8fb4f2862a7bd",
+    "0b7385155dcef742cc456d5741ae93a3",
+    "d55fadb221f0ea20266e57cd413e7b94",
+    "9bd6eb226c7e169b8d53cf70aea98b3a",
+    "60eff8064634b6c73b10681356baeee9",
+    "1559aeb081a9c0c71111d6093c2ff9fd",
+#if CONFIG_SMOOTH_HV
+    "c15479b739713773e5cabb748451987b",
+    "72e33ec12c9b67aea26d8d005fb82de2",
+#endif
+  };
+  static const char *const kSignatures16x32[kNumAv1IntraFuncs] = {
+    "abe5233d189cdbf79424721571bbaa7b",
+    "282759f81e3cfb2e2d396fe406b72a8b",
+    "e2224926c264f6f174cbc3167a233168",
+    "6814e85c2b33f8c9415d62e80394b47b",
+    "99cbbb60459c08a3061d72c4e4f6276a",
+    "1d1567d40b8e816f8c1f71e576fe0f87",
+    "5e989f9c748a0d2cd8c4ebf9d3fe1278",
+    "7135a2f419452a3a192a35156f68b019",
+    "06e10af5a726d2c81b8f8c708204f9fb",
+    "c0882f0e7ba1ffa0aeef6d5c751df6de",
+    "8477429e17d39a423f30e2082f651549",
+    "ba35068a30c2d1d10901e4bfabd02a11",
+    "36fdd371b624a075814d497c4832ec85",
+    "8ab8da61b727442b6ff692b40d0df018",
+#if CONFIG_SMOOTH_HV
+    "e35a10ad7fdf2327e821504a90f6a6eb",
+    "1f7211e727dc1de7d6a55d082fbdd821",
+#endif
+  };
+  if (!strcmp(block_name, "intra16x16")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures16x16, 16, 16);
+  }
+  if (!strcmp(block_name, "intra16x8")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures16x8, 16, 8);
+  }
+  if (!strcmp(block_name, "intra16x32")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures16x32, 16, 32);
+  }
 }
 
-void TestIntraPred32(AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
-    "558541656d84f9ae7896db655826febe",
-    "b3587a1f9a01495fa38c8cd3c8e2a1bf",
-    "4c6501e64f25aacc55a2a16c7e8f0255",
+void TestIntraPred32(const char *block_name, AvxPredFunc const *pred_funcs) {
+  static const char *const kSignatures32x32[kNumAv1IntraFuncs] = {
+    "a0a618c900e65ae521ccc8af789729f2",
+    "985aaa7c72b4a6c2fb431d32100cf13a",
+    "10662d09febc3ca13ee4e700120daeb5",
     "b3b01379ba08916ef6b1b35f7d9ad51c",
-    "0f1eb38b6cbddb3d496199ef9f329071",
-    "911c06efb9ed1c3b4c104b232b55812f",
-    "b4f9f177a8a259514f039cfb403a61e3",
-    "0a6d584a44f8db9aa7ade2e2fdb9fc9e",
-    "b01c9076525216925f3456f034fb6eee",
-    "d267e20ad9e5cd2915d1a47254d3d149",
-    "3c45418137114cb6cef4c7a7baf4855c",
-    "d520125ebd512c63c301bf67fea8e059",
-#if CONFIG_ALT_INTRA
-    "297e8fbb5d33c29b12b228fa9d7c40a4",
-    "31b9296d70dd82238c87173e6d5e65fd",
-#if CONFIG_SMOOTH_HV
-    "f1041f77a34e86aaf30ea779ba84a2e8",
-    "83e2b744a6a3d82321744442b1db945c",
+    "9f4261755795af97e34679c333ec7004",
+    "bc2c9da91ad97ef0d1610fb0a9041657",
+    "f524b1a7e31c7bb9bfb2487fac3e16d8",
+    "4039bb7da0f6860090d3c57b5c85468f",
+    "b29fff7b61804e68383e3a609b33da58",
+    "e1aa5e49067fd8dba66c2eb8d07b7a89",
+    "db217e7891581cf93895ef5974bebb21",
+    "beb6cdc52b52c8976b4d2407ec8d2313",
+    "ef1653982b69e1f64bee3759f3e1ec45",
+    "1a51a675deba2c83282142eb48d3dc3d",
+#if CONFIG_SMOOTH_HV
+    "866c224746dc260cda861a7b1b383fb3",
+    "cea23799fc3526e1b6a6ff02b42b82af",
 #endif  // CONFIG_SMOOTH_HV
-#else
-    "9e1370c6d42e08d357d9612c93a71cfc",
-#endif  // CONFIG_ALT_INTRA
   };
-  TestIntraPred("Intra32", pred_funcs, kAv1IntraPredNames, kNumAv1IntraFuncs,
-                kSignatures, 32, 32 * 32 * kNumAv1IntraFuncs);
+  static const char *const kSignatures32x16[kNumAv1IntraFuncs] = {
+    "d1aeb8d5fdcfd3307922af01a798a4dc",
+    "b0bcb514ebfbee065faea9d34c12ae75",
+    "d6a18c63b4e909871c0137ca652fad23",
+    "fd047f2fc1b8ffb95d0eeef3e8796a45",
+    "645ab60779ea348fd93c81561c31bab9",
+    "4409633c9db8dff41ade4292a3a56e7f",
+    "b9b2935b2287a9a461ac5c11251ac706",
+    "43b05f808c0ac4fe8accd84d293b0488",
+    "1d2cb43872d20c205ffb185102bcd22a",
+    "2c1551b5e99592fd21053b5d14e397d9",
+    "cd499ef0dd41e2e38d5dac3319dfdd97",
+    "cd2610426637003f3b5d3984cb3320d5",
+    "5e36a11e069b31c2a739f3a9c7b37c24",
+    "e83b9483d702cfae496991c3c7fa92c0",
+#if CONFIG_SMOOTH_HV
+    "12f6ddf98c7f30a277307f1ea935b030",
+    "354321d6c32bbdb0739e4fa2acbf41e1",
+#endif
+  };
+  if (!strcmp(block_name, "intra32x32")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures32x32, 32, 32);
+  }
+  if (!strcmp(block_name, "intra32x16")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures32x16, 32, 16);
+  }
 }
 
 }  // namespace
 
 // Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
 // to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4.
-#define INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, v, h, \
-                        d45e, d135, d117, d153, d207e, d63e, tm, smooth,    \
-                        smooth_v, smooth_h)                                 \
-  TEST(arch, test_func) {                                                   \
-    static const AvxPredFunc aom_intra_pred[] = {                           \
-      dc,   dc_left, dc_top, dc_128, v,  h,      d45e,     d135,            \
-      d117, d153,    d207e,  d63e,   tm, smooth, smooth_v, smooth_h         \
-    };                                                                      \
-    test_func(aom_intra_pred);                                              \
+#define INTRA_PRED_TEST(arch, test_func, blk, dc, dc_left, dc_top, dc_128, v, \
+                        h, d45e, d135, d117, d153, d207e, d63e, tm, smooth,   \
+                        smooth_v, smooth_h)                                   \
+  TEST(arch, DISABLED_##test_func) {                                          \
+    static const AvxPredFunc aom_intra_pred[] = {                             \
+      dc,   dc_left, dc_top, dc_128, v,  h,      d45e,     d135,              \
+      d117, d153,    d207e,  d63e,   tm, smooth, smooth_v, smooth_h           \
+    };                                                                        \
+    test_func(blk, aom_intra_pred);                                           \
   }
 
 // -----------------------------------------------------------------------------
 // 4x4
 
-#if CONFIG_ALT_INTRA
-#define tm_pred_func aom_paeth_predictor_4x4_c
-#define smooth_pred_func aom_smooth_predictor_4x4_c
 #if CONFIG_SMOOTH_HV
 #define smooth_v_pred_func aom_smooth_v_predictor_4x4_c
 #define smooth_h_pred_func aom_smooth_h_predictor_4x4_c
@@ -231,95 +388,91 @@ void TestIntraPred32(AvxPredFunc const *pred_funcs) {
 #define smooth_v_pred_func NULL
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
-#else
-#define tm_pred_func aom_tm_predictor_4x4_c
-#define smooth_pred_func NULL
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_ALT_INTRA
 
-INTRA_PRED_TEST(C, TestIntraPred4, aom_dc_predictor_4x4_c,
+INTRA_PRED_TEST(C_1, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_c,
                 aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c,
                 aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c,
                 aom_h_predictor_4x4_c, aom_d45e_predictor_4x4_c,
                 aom_d135_predictor_4x4_c, aom_d117_predictor_4x4_c,
                 aom_d153_predictor_4x4_c, aom_d207e_predictor_4x4_c,
-                aom_d63e_predictor_4x4_c, tm_pred_func, smooth_pred_func,
-                smooth_v_pred_func, smooth_h_pred_func)
+                aom_d63e_predictor_4x4_c, aom_paeth_predictor_4x4_c,
+                aom_smooth_predictor_4x4_c, smooth_v_pred_func,
+                smooth_h_pred_func)
 
-#undef tm_pred_func
-#undef smooth_pred_func
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
-#if HAVE_SSE2
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_4x8_c
+#define smooth_h_pred_func aom_smooth_h_predictor_4x8_c
 #else
-#define tm_pred_func aom_tm_predictor_4x4_sse2
-#endif  // CONFIG_ALT_INTRA
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+INTRA_PRED_TEST(C_2, TestIntraPred4, "intra4x8", aom_dc_predictor_4x8_c,
+                aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c,
+                aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c,
+                aom_h_predictor_4x8_c, aom_d45e_predictor_4x8_c,
+                aom_d135_predictor_4x8_c, aom_d117_predictor_4x8_c,
+                aom_d153_predictor_4x8_c, aom_d207e_predictor_4x8_c,
+                aom_d63e_predictor_4x8_c, aom_paeth_predictor_4x8_c,
+                aom_smooth_predictor_4x8_c, smooth_v_pred_func,
+                smooth_h_pred_func)
 
-INTRA_PRED_TEST(SSE2, TestIntraPred4, aom_dc_predictor_4x4_sse2,
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_sse2,
                 aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
                 aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
                 aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred4, "intra4x8", aom_dc_predictor_4x8_sse2,
+                aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
+                aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2,
+                aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, aom_d153_predictor_4x4_ssse3, NULL,
-                aom_d63e_predictor_4x4_ssse3, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSSE3_1, TestIntraPred4, "intra4x4", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_4x4_ssse3,
+                NULL, aom_d63e_predictor_4x4_ssse3,
+                aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3,
+                NULL, NULL)
+INTRA_PRED_TEST(SSSE3_2, TestIntraPred4, "intra4x8", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3,
+                NULL, NULL)
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_4x4_dspr2
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(DSPR2, TestIntraPred4, aom_dc_predictor_4x4_dspr2, NULL, NULL,
-                NULL, NULL, aom_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_dspr2,
+                NULL, NULL, NULL, NULL, aom_h_predictor_4x4_dspr2, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_4x4_neon
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(NEON, TestIntraPred4, aom_dc_predictor_4x4_neon,
+INTRA_PRED_TEST(NEON, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_neon,
                 aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
                 aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
                 aom_h_predictor_4x4_neon, NULL, aom_d135_predictor_4x4_neon,
-                NULL, NULL, NULL, NULL, tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_4x4_msa
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(MSA, TestIntraPred4, aom_dc_predictor_4x4_msa,
+INTRA_PRED_TEST(MSA, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_msa,
                 aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa,
                 aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa,
                 aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
 // -----------------------------------------------------------------------------
 // 8x8
 
-#if CONFIG_ALT_INTRA
-#define tm_pred_func aom_paeth_predictor_8x8_c
-#define smooth_pred_func aom_smooth_predictor_8x8_c
 #if CONFIG_SMOOTH_HV
 #define smooth_v_pred_func aom_smooth_v_predictor_8x8_c
 #define smooth_h_pred_func aom_smooth_h_predictor_8x8_c
@@ -327,91 +480,114 @@ INTRA_PRED_TEST(MSA, TestIntraPred4, aom_dc_predictor_4x4_msa,
 #define smooth_v_pred_func NULL
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
-#else
-#define tm_pred_func aom_tm_predictor_8x8_c
-#define smooth_pred_func NULL
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(C, TestIntraPred8, aom_dc_predictor_8x8_c,
+INTRA_PRED_TEST(C_1, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_c,
                 aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c,
                 aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c,
                 aom_h_predictor_8x8_c, aom_d45e_predictor_8x8_c,
                 aom_d135_predictor_8x8_c, aom_d117_predictor_8x8_c,
                 aom_d153_predictor_8x8_c, aom_d207e_predictor_8x8_c,
-                aom_d63e_predictor_8x8_c, tm_pred_func, smooth_pred_func,
-                smooth_v_pred_func, smooth_h_pred_func)
-#undef tm_pred_func
-#undef smooth_pred_func
+                aom_d63e_predictor_8x8_c, aom_paeth_predictor_8x8_c,
+                aom_smooth_predictor_8x8_c, smooth_v_pred_func,
+                smooth_h_pred_func)
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
-#if HAVE_SSE2
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_8x4_c
+#define smooth_h_pred_func aom_smooth_h_predictor_8x4_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_2, TestIntraPred8, "intra8x4", aom_dc_predictor_8x4_c,
+                aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c,
+                aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c,
+                aom_h_predictor_8x4_c, aom_d45e_predictor_8x4_c,
+                aom_d135_predictor_8x4_c, aom_d117_predictor_8x4_c,
+                aom_d153_predictor_8x4_c, aom_d207e_predictor_8x4_c,
+                aom_d63e_predictor_8x4_c, aom_paeth_predictor_8x4_c,
+                aom_smooth_predictor_8x4_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_8x16_c
+#define smooth_h_pred_func aom_smooth_h_predictor_8x16_c
 #else
-#define tm_pred_func aom_tm_predictor_8x8_sse2
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(SSE2, TestIntraPred8, aom_dc_predictor_8x8_sse2,
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_3, TestIntraPred8, "intra8x16", aom_dc_predictor_8x16_c,
+                aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c,
+                aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c,
+                aom_h_predictor_8x16_c, aom_d45e_predictor_8x16_c,
+                aom_d135_predictor_8x16_c, aom_d117_predictor_8x16_c,
+                aom_d153_predictor_8x16_c, aom_d207e_predictor_8x16_c,
+                aom_d63e_predictor_8x16_c, aom_paeth_predictor_8x16_c,
+                aom_smooth_predictor_8x16_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_sse2,
                 aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
                 aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
                 aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred8, "intra8x4", aom_dc_predictor_8x4_sse2,
+                aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2,
+                aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2,
+                aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TestIntraPred8, "intra8x16", aom_dc_predictor_8x16_sse2,
+                aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
+                aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
+                aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, aom_d153_predictor_8x8_ssse3, NULL, NULL, NULL,
-                NULL, NULL, NULL)
+INTRA_PRED_TEST(SSSE3_1, TestIntraPred8, "intra8x8", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_8x8_ssse3,
+                NULL, NULL, aom_paeth_predictor_8x8_ssse3,
+                aom_smooth_predictor_8x8_ssse3, NULL, NULL)
+INTRA_PRED_TEST(SSSE3_2, TestIntraPred8, "intra8x4", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_8x4_ssse3, aom_smooth_predictor_8x4_ssse3,
+                NULL, NULL)
+INTRA_PRED_TEST(SSSE3_3, TestIntraPred8, "intra8x16", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3,
+                NULL, NULL)
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_8x8_dspr2
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(DSPR2, TestIntraPred8, aom_dc_predictor_8x8_dspr2, NULL, NULL,
-                NULL, NULL, aom_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_dspr2,
+                NULL, NULL, NULL, NULL, aom_h_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_8x8_neon
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(NEON, TestIntraPred8, aom_dc_predictor_8x8_neon,
+INTRA_PRED_TEST(NEON, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_neon,
                 aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
                 aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
                 aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_8x8_msa
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(MSA, TestIntraPred8, aom_dc_predictor_8x8_msa,
+INTRA_PRED_TEST(MSA, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_msa,
                 aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa,
                 aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa,
                 aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
 // -----------------------------------------------------------------------------
 // 16x16
 
-#if CONFIG_ALT_INTRA
-#define tm_pred_func aom_paeth_predictor_16x16_c
-#define smooth_pred_func aom_smooth_predictor_16x16_c
 #if CONFIG_SMOOTH_HV
 #define smooth_v_pred_func aom_smooth_v_predictor_16x16_c
 #define smooth_h_pred_func aom_smooth_h_predictor_16x16_c
@@ -419,87 +595,130 @@ INTRA_PRED_TEST(MSA, TestIntraPred8, aom_dc_predictor_8x8_msa,
 #define smooth_v_pred_func NULL
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
-#else
-#define tm_pred_func aom_tm_predictor_16x16_c
-#define smooth_pred_func NULL
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(C, TestIntraPred16, aom_dc_predictor_16x16_c,
+INTRA_PRED_TEST(C_1, TestIntraPred16, "intra16x16", aom_dc_predictor_16x16_c,
                 aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
                 aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
                 aom_h_predictor_16x16_c, aom_d45e_predictor_16x16_c,
                 aom_d135_predictor_16x16_c, aom_d117_predictor_16x16_c,
                 aom_d153_predictor_16x16_c, aom_d207e_predictor_16x16_c,
-                aom_d63e_predictor_16x16_c, tm_pred_func, smooth_pred_func,
-                smooth_v_pred_func, smooth_h_pred_func)
-#undef tm_pred_func
-#undef smooth_pred_func
+                aom_d63e_predictor_16x16_c, aom_paeth_predictor_16x16_c,
+                aom_smooth_predictor_16x16_c, smooth_v_pred_func,
+                smooth_h_pred_func)
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
-#if HAVE_SSE2
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_16x8_c
+#define smooth_h_pred_func aom_smooth_h_predictor_16x8_c
 #else
-#define tm_pred_func aom_tm_predictor_16x16_sse2
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(SSE2, TestIntraPred16, aom_dc_predictor_16x16_sse2,
-                aom_dc_left_predictor_16x16_sse2,
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_2, TestIntraPred16, "intra16x8", aom_dc_predictor_16x8_c,
+                aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c,
+                aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c,
+                aom_h_predictor_16x8_c, aom_d45e_predictor_16x8_c,
+                aom_d135_predictor_16x8_c, aom_d117_predictor_16x8_c,
+                aom_d153_predictor_16x8_c, aom_d207e_predictor_16x8_c,
+                aom_d63e_predictor_16x8_c, aom_paeth_predictor_16x8_c,
+                aom_smooth_predictor_16x8_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_16x32_c
+#define smooth_h_pred_func aom_smooth_h_predictor_16x32_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_3, TestIntraPred16, "intra16x32", aom_dc_predictor_16x32_c,
+                aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c,
+                aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c,
+                aom_h_predictor_16x32_c, aom_d45e_predictor_16x32_c,
+                aom_d135_predictor_16x32_c, aom_d117_predictor_16x32_c,
+                aom_d153_predictor_16x32_c, aom_d207e_predictor_16x32_c,
+                aom_d63e_predictor_16x32_c, aom_paeth_predictor_16x32_c,
+                aom_smooth_predictor_16x32_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TestIntraPred16, "intra16x16",
+                aom_dc_predictor_16x16_sse2, aom_dc_left_predictor_16x16_sse2,
                 aom_dc_top_predictor_16x16_sse2,
                 aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
                 aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred16, "intra16x8",
+                aom_dc_predictor_16x8_sse2, aom_dc_left_predictor_16x8_sse2,
+                aom_dc_top_predictor_16x8_sse2, aom_dc_128_predictor_16x8_sse2,
+                aom_v_predictor_16x8_sse2, aom_h_predictor_16x8_sse2, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TestIntraPred16, "intra16x32",
+                aom_dc_predictor_16x32_sse2, aom_dc_left_predictor_16x32_sse2,
+                aom_dc_top_predictor_16x32_sse2,
+                aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
+                aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, aom_d153_predictor_16x16_ssse3, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSSE3_1, TestIntraPred16, "intra16x16", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_16x16_ssse3,
+                NULL, NULL, aom_paeth_predictor_16x16_ssse3,
+                aom_smooth_predictor_16x16_ssse3, NULL, NULL)
+INTRA_PRED_TEST(SSSE3_2, TestIntraPred16, "intra16x8", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3,
+                NULL, NULL)
+INTRA_PRED_TEST(SSSE3_3, TestIntraPred16, "intra16x32", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x32_ssse3,
+                aom_smooth_predictor_16x32_ssse3, NULL, NULL)
 #endif  // HAVE_SSSE3
 
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2_1, TestIntraPred16, "intra16x16", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_2, TestIntraPred16, "intra16x8", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_3, TestIntraPred16, "intra16x32", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
+#endif  // HAVE_AVX2
+
 #if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred16, aom_dc_predictor_16x16_dspr2, NULL,
-                NULL, NULL, NULL, aom_h_predictor_16x16_dspr2, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(DSPR2, TestIntraPred16, "intra16x16",
+                aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL, NULL,
+                aom_h_predictor_16x16_dspr2, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_16x16_neon
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(NEON, TestIntraPred16, aom_dc_predictor_16x16_neon,
-                aom_dc_left_predictor_16x16_neon,
+INTRA_PRED_TEST(NEON, TestIntraPred16, "intra16x16",
+                aom_dc_predictor_16x16_neon, aom_dc_left_predictor_16x16_neon,
                 aom_dc_top_predictor_16x16_neon,
                 aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
                 aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_16x16_msa
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(MSA, TestIntraPred16, aom_dc_predictor_16x16_msa,
+INTRA_PRED_TEST(MSA, TestIntraPred16, "intra16x16", aom_dc_predictor_16x16_msa,
                 aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa,
                 aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa,
                 aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
 // -----------------------------------------------------------------------------
 // 32x32
 
-#if CONFIG_ALT_INTRA
-#define tm_pred_func aom_paeth_predictor_32x32_c
-#define smooth_pred_func aom_smooth_predictor_32x32_c
 #if CONFIG_SMOOTH_HV
 #define smooth_v_pred_func aom_smooth_v_predictor_32x32_c
 #define smooth_h_pred_func aom_smooth_h_predictor_32x32_c
@@ -507,73 +726,765 @@ INTRA_PRED_TEST(MSA, TestIntraPred16, aom_dc_predictor_16x16_msa,
 #define smooth_v_pred_func NULL
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
-#else
-#define tm_pred_func aom_tm_predictor_32x32_c
-#define smooth_pred_func NULL
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(C, TestIntraPred32, aom_dc_predictor_32x32_c,
+INTRA_PRED_TEST(C_1, TestIntraPred32, "intra32x32", aom_dc_predictor_32x32_c,
                 aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
                 aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
                 aom_h_predictor_32x32_c, aom_d45e_predictor_32x32_c,
                 aom_d135_predictor_32x32_c, aom_d117_predictor_32x32_c,
                 aom_d153_predictor_32x32_c, aom_d207e_predictor_32x32_c,
-                aom_d63e_predictor_32x32_c, tm_pred_func, smooth_pred_func,
-                smooth_v_pred_func, smooth_h_pred_func)
-#undef tm_pred_func
-#undef smooth_pred_func
+                aom_d63e_predictor_32x32_c, aom_paeth_predictor_32x32_c,
+                aom_smooth_predictor_32x32_c, smooth_v_pred_func,
+                smooth_h_pred_func)
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
-#if HAVE_SSE2
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_32x16_c
+#define smooth_h_pred_func aom_smooth_h_predictor_32x16_c
 #else
-#define tm_pred_func aom_tm_predictor_32x32_sse2
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(SSE2, TestIntraPred32, aom_dc_predictor_32x32_sse2,
-                aom_dc_left_predictor_32x32_sse2,
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_2, TestIntraPred32, "intra32x16", aom_dc_predictor_32x16_c,
+                aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c,
+                aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c,
+                aom_h_predictor_32x16_c, aom_d45e_predictor_32x16_c,
+                aom_d135_predictor_32x16_c, aom_d117_predictor_32x16_c,
+                aom_d153_predictor_32x16_c, aom_d207e_predictor_32x16_c,
+                aom_d63e_predictor_32x16_c, aom_paeth_predictor_32x16_c,
+                aom_smooth_predictor_32x16_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TestIntraPred32, "intra32x32",
+                aom_dc_predictor_32x32_sse2, aom_dc_left_predictor_32x32_sse2,
                 aom_dc_top_predictor_32x32_sse2,
                 aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
                 aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred32, "intra32x16",
+                aom_dc_predictor_32x16_sse2, aom_dc_left_predictor_32x16_sse2,
+                aom_dc_top_predictor_32x16_sse2,
+                aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
+                aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, aom_d153_predictor_32x32_ssse3, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSSE3_1, TestIntraPred32, "intra32x32", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_32x32_ssse3,
+                NULL, NULL, aom_paeth_predictor_32x32_ssse3,
+                aom_smooth_predictor_32x32_ssse3, NULL, NULL)
+INTRA_PRED_TEST(SSSE3_2, TestIntraPred32, "intra32x16", NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x16_ssse3,
+                aom_smooth_predictor_32x16_ssse3, NULL, NULL)
 #endif  // HAVE_SSSE3
 
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2_1, TestIntraPred32, "intra32x32",
+                aom_dc_predictor_32x32_avx2, aom_dc_left_predictor_32x32_avx2,
+                aom_dc_top_predictor_32x32_avx2,
+                aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2,
+                aom_h_predictor_32x32_avx2, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x32_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_2, TestIntraPred32, "intra32x16",
+                aom_dc_predictor_32x16_avx2, aom_dc_left_predictor_32x16_avx2,
+                aom_dc_top_predictor_32x16_avx2,
+                aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_32x32_neon
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(NEON, TestIntraPred32, aom_dc_predictor_32x32_neon,
-                aom_dc_left_predictor_32x32_neon,
+INTRA_PRED_TEST(NEON, TestIntraPred32, "intra32x32",
+                aom_dc_predictor_32x32_neon, aom_dc_left_predictor_32x32_neon,
                 aom_dc_top_predictor_32x32_neon,
                 aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
                 aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-#if CONFIG_ALT_INTRA
-#define tm_pred_func NULL
-#else
-#define tm_pred_func aom_tm_predictor_32x32_msa
-#endif  // CONFIG_ALT_INTRA
-INTRA_PRED_TEST(MSA, TestIntraPred32, aom_dc_predictor_32x32_msa,
+INTRA_PRED_TEST(MSA, TestIntraPred32, "intra32x32", aom_dc_predictor_32x32_msa,
                 aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa,
                 aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa,
                 aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                tm_pred_func, NULL, NULL, NULL)
-#undef tm_pred_func
+                NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
+// -----------------------------------------------------------------------------
+// High Bitdepth
+#if CONFIG_HIGHBITDEPTH
+namespace {
+
+typedef void (*AvxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
+                                  const uint16_t *above, const uint16_t *left,
+                                  int bd);
+
+typedef IntraPredTestMem<uint16_t> Av1HighbdIntraPredTestMem;
+
+void TestHighbdIntraPred(const char name[], AvxHighbdPredFunc const *pred_funcs,
+                         const char *const signatures[], int block_width,
+                         int block_height) {
+  const int num_pixels_per_test =
+      block_width * block_height * kNumAv1IntraFuncs;
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+  Av1HighbdIntraPredTestMem intra_pred_test_mem;
+  const uint16_t *const above = intra_pred_test_mem.above_mem + 16;
+  const int bd = 12;
+
+  intra_pred_test_mem.Init(block_width, bd);
+
+  for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+                    intra_pred_test_mem.left, bd);
+    }
+    libaom_test::ClearSystemState();
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
+  }
+}
+
+void TestHighbdIntraPred4(const char *block_name,
+                          AvxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures4x4[kNumAv1IntraFuncs] = {
+    "11f74af6c5737df472f3275cbde062fa",
+    "51bea056b6447c93f6eb8f6b7e8f6f71",
+    "27e97f946766331795886f4de04c5594",
+    "53ab15974b049111fb596c5168ec7e3f",
+    "f0b640bb176fbe4584cf3d32a9b0320a",
+    "729783ca909e03afd4b47111c80d967b",
+    "d631a8544ccc87702db3e98fac494657",
+    "293fc903254a33754133314c6cdba81f",
+    "f8074d704233e73dfd35b458c6092374",
+    "aa6363d08544a1ec4da33d7a0be5640d",
+    "0bdc21a3acdebc393bc2c22e71bbeada",
+    "a48f7a484ba4ad3916055c7160665b56",
+    "6e30009c45474a22032678b1bd579c8f",
+    "e57cba016d808aa8a35619df2a65f049",
+#if CONFIG_SMOOTH_HV
+    "55a6c37f39afcbbf5abca4a985b96459",
+    "a623d45b37dafec1f8a75c4c5218913d",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  static const char *const kSignatures4x8[kNumAv1IntraFuncs] = {
+    "22d519b796d59644043466320e4ccd14",
+    "09513a738c49b3f9542d27f34abbe1d5",
+    "807ae5e8813443ff01e71be6efacfb69",
+    "cbfa18d0293430b6e9708b0be1fd2394",
+    "346c354c34ec7fa780b576db355dab88",
+    "f97dae85c35359632380b09ca98d611e",
+    "aed1beef71de33856c814ff7d63dd9db",
+    "49c47c04dd3d23d6fc5cc32bf9d40ae4",
+    "a24aade6e22b323ee28c8bf08aa2d234",
+    "aefef502f9e144e71cd27dc7383b3c28",
+    "b284ae5277b85ebdd16b5952149f7458",
+    "8dc5791167271f6f347582e07379f580",
+    "698ae351d8896d89ed9e4e67b6e53eda",
+    "dcc197034a9c45a3d8238bf085835f4e",
+#if CONFIG_SMOOTH_HV
+    "7a35e2c42ffdc2efc2d6d1d75a100fc7",
+    "41ab6cebd4516c87a91b2a593e2c2506",
+#endif
+  };
+
+  if (!strcmp(block_name, "Hbd Intra4x4")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures4x4, 4, 4);
+  }
+  if (!strcmp(block_name, "Hbd Intra4x8")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures4x8, 4, 8);
+  }
+}
+
+void TestHighbdIntraPred8(const char *block_name,
+                          AvxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures8x8[kNumAv1IntraFuncs] = {
+    "03da8829fe94663047fd108c5fcaa71d",
+    "ecdb37b8120a2d3a4c706b016bd1bfd7",
+    "1d4543ed8d2b9368cb96898095fe8a75",
+    "f791c9a67b913cbd82d9da8ecede30e2",
+    "065c70646f4dbaff913282f55a45a441",
+    "51f87123616662ef7c35691497dfd0ba",
+    "4f53cf8e5f43894dc0759f43c7081f60",
+    "9ffe186a6bc7db95275f1bbddd6f7aba",
+    "a3258a2eae2e2bd55cb8f71351b22998",
+    "8d909f0a2066e39b3216092c6289ece4",
+    "6751f60655aba44aff78aaaf4e967377",
+    "d31a449872fab968a8d41de578338780",
+    "85c01ba03df68f9ece7bd3fa0f8980e6",
+    "ad19b7dac092f56df6d054e1f67f21e7",
+#if CONFIG_SMOOTH_HV
+    "0edc415b5dd7299f7a34fb9f71d31d78",
+    "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  static const char *const kSignatures8x4[kNumAv1IntraFuncs] = {
+    "d58cd4c4bf3b7bbaa5db5e1a5622ec78",
+    "6e572c35aa782d00cafcb99e9ea047ea",
+    "e8c22a3702b416dc9ab974505afbed09",
+    "aaa4e4762a795aad7ad74de0c662c4e4",
+    "a19f9101967383c3dcbd516dc317a291",
+    "9ab8cb91f1a595b9ebe3fe8de58031aa",
+    "c6c7d65264397d4d31e378e1f1cfd921",
+    "5804158e463ff794b6b8a623f5d2c10d",
+    "c342cdeb39aae4c4f7be10e057029298",
+    "c1bbbcfe4b25f6b8eca6ad2f7ee793d3",
+    "98d1dab8b949859b9c65298ee9f105f8",
+    "396e803aaf6d7a03a231edc48b396051",
+    "2cf9021d5f1169268699807ee118b65f",
+    "ee9605fcbd6fb871f1c5cd81a6989327",
+#if CONFIG_SMOOTH_HV
+    "0edc415b5dd7299f7a34fb9f71d31d78",
+    "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
+#endif
+  };
+  static const char *const kSignatures8x16[kNumAv1IntraFuncs] = {
+    "4562de1d0336610880fdd5685498a9ec",
+    "16310fa7076394f16fc85c4b149d89c9",
+    "0e94af88e1dc573b6f0f499cddd1f530",
+    "dfd245ee20d091c67809160340365aa9",
+    "d3562504327f70c096c5be23fd8a3747",
+    "601b853558502acbb5135eadd2da117a",
+    "e83f9a8bc16b507d2ed0b6b31a25d6f5",
+    "fc8427d942246e8cba81247bb294afb5",
+    "89cde712e4c1ef675ea156ad679c62c7",
+    "0a68c2b28c3b171ad797cf76a7058f10",
+    "e70724010e12d8f374cedd3910ceb0d5",
+    "ad7987e91267503ba6fd3e8be42eb48c",
+    "3c624345a723a1b2b1bea05a6a08bc99",
+    "2a9c781de609e0184cc7ab442050f4e5",
+#if CONFIG_SMOOTH_HV
+    "0ddc5035c22252747126b61fc238c74d",
+    "e43f5d83bab759af69c7b6773fc8f9b2",
+#endif
+  };
+  if (!strcmp(block_name, "Hbd Intra8x8")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures8x8, 8, 8);
+  }
+  if (!strcmp(block_name, "Hbd Intra8x4")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures8x4, 8, 4);
+  }
+  if (!strcmp(block_name, "Hbd Intra8x16")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures8x16, 8, 16);
+  }
+}
+
+void TestHighbdIntraPred16(const char *block_name,
+                           AvxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures16x16[kNumAv1IntraFuncs] = {
+    "e33cb3f56a878e2fddb1b2fc51cdd275",
+    "c7bff6f04b6052c8ab335d726dbbd52d",
+    "d0b0b47b654a9bcc5c6008110a44589b",
+    "78f5da7b10b2b9ab39f114a33b6254e9",
+    "c78e31d23831abb40d6271a318fdd6f3",
+    "90d1347f4ec9198a0320daecb6ff90b8",
+    "e38e12830e2ee5a01a064ec5998d5948",
+    "cf28bd387b81ad3e5f1a1c779a4b70a0",
+    "24c304330431ddeaf630f6ce94af2eac",
+    "91a329798036bf64e8e00a87b131b8b1",
+    "e536338d1a8ee192b9e591855db1a222",
+    "54ecd47737f71c62d24e3779585113f2",
+    "e63ded54ab3d0e8728b6f24d4f01e53f",
+    "35ce21fbe0ea114c089fc3489a78155d",
+#if CONFIG_SMOOTH_HV
+    "f277f6ef8e4d717f1f0dfe2706ac197d",
+    "e8014d3f41256976c02e0f1e622ba2b9",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  static const char *const kSignatures16x8[kNumAv1IntraFuncs] = {
+    "a57d6b5a9bfd30c29591d8717ace9c51",
+    "f5907ba97ee6c53e339e953fc8d845ee",
+    "ea3aa727913ce45af06f89dd1808db5f",
+    "408af4f23e48d14b48ee35ae094fcd18",
+    "85c41cbcb5d744f7961e8950026fbffe",
+    "8a4e588a837638887ba671f8d4910485",
+    "caae3cc3d419bbd28aa389dbe4febee1",
+    "ea67fb80d71b6471467c79662af1186c",
+    "c83f7252412dd1ad2fc6af848e7f6be8",
+    "f45af3d697f42f1b9b8def4e46bac78c",
+    "dca4a2aaf5f63db387e264ba5963943a",
+    "d01b1bcc50b4b66c1231142eae628cd3",
+    "b792d8826b67a21757ea7097cff9e05b",
+    "f94ce7101bb87fd3bb9312112527dbf4",
+#if CONFIG_SMOOTH_HV
+    "688c6660a6dc6fa61fa1aa38e708c209",
+    "0cdf641b4f81d69509c92ae0b93ef5ff",
+#endif
+  };
+  static const char *const kSignatures16x32[kNumAv1IntraFuncs] = {
+    "aee4b3b0e3cc02d48e2c40d77f807927",
+    "8baef2b2e789f79c8df9d90ad10f34a4",
+    "038c38ee3c4f090bb8d736eab136aafc",
+    "1a3de2aaeaffd68a9fd6c7f6557b83f3",
+    "385c6e0ea29421dd81011a2934641e26",
+    "6cf96c285d1a2d4787f955dad715b08c",
+    "21f82421fda1c3afca8baca0dc048a52",
+    "eac3734852c99a051f6d15a921d9e7b9",
+    "c81f7ffec79508bf78d0f2c67d8abe96",
+    "14b8c62304f65a06653b9b35dfe12d97",
+    "e0893310042511275ae04e5186ee5326",
+    "b4f05903a6191093be719794417ac6fd",
+    "2d7f75dcd73b9528c8396279ff09ff3a",
+    "5a63cd1841e4ed470e4ca5ef845f2281",
+#if CONFIG_SMOOTH_HV
+    "610d899ca945fbead33287d4335a8b32",
+    "6bafaad81fce37be46730187e78d8b11",
+#endif
+  };
+  if (!strcmp(block_name, "Hbd Intra16x16")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures16x16, 16, 16);
+  }
+  if (!strcmp(block_name, "Hbd Intra16x8")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures16x8, 16, 8);
+  }
+  if (!strcmp(block_name, "Hbd Intra16x32")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures16x32, 16, 32);
+  }
+}
+
+void TestHighbdIntraPred32(const char *block_name,
+                           AvxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures32x32[kNumAv1IntraFuncs] = {
+    "a3e8056ba7e36628cce4917cd956fedd",
+    "cc7d3024fe8748b512407edee045377e",
+    "2aab0a0f330a1d3e19b8ecb8f06387a3",
+    "a547bc3fb7b06910bf3973122a426661",
+    "26f712514da95042f93d6e8dc8e431dc",
+    "bb08c6e16177081daa3d936538dbc2e3",
+    "4e10f10b082a5b4265080c102d34eb47",
+    "42867c8553285e94ee8e4df7abafbda8",
+    "6496bdee96100667833f546e1be3d640",
+    "2ebfa25bf981377e682e580208504300",
+    "1788695b10a6f82ae1a56686dcbcd0a9",
+    "c3b9c506604a7132bbb5f4e97bdb03f0",
+    "84bf83f94a51b33654ca940c6f8bc057",
+    "7168b03fc31bf29596a344d6a35d007c",
+#if CONFIG_SMOOTH_HV
+    "b073a70d3672f1282236994f5d12e94b",
+    "c51607aebad5dcb3c1e3b58ef9e5b84e",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  static const char *const kSignatures32x16[kNumAv1IntraFuncs] = {
+    "290b23c9f5a1de7905bfa71a942da29b",
+    "701e7b82593c66da5052fc4b6afd79ce",
+    "4da828c5455cd246735a663fbb204989",
+    "e3fbeaf234efece8dbd752b77226200c",
+    "4d1d8c969f05155a7e7e84cf7aad021b",
+    "c22e4877c2c946d5bdc0d542e29e70cf",
+    "ffd86b234d65c2e1386a5b5b5c188a69",
+    "50aaaa7d90e300b635ab18cdd73e189b",
+    "a945dc7429df168e2169d81b58a15859",
+    "66725070d7fad02dee78730ba0843e19",
+    "33d873cb05d45df2af4ff59033833db7",
+    "0dd783695b69271f65d56f5516fa6dc0",
+    "8ac1ce815e7780500f842b0beb0bb980",
+    "9fee2e2502b507f25bfad30a55b0b610",
+#if CONFIG_SMOOTH_HV
+    "4ced9c212ec6f9956e27f68a91b59fef",
+    "4a7a0b93f138bb0863e4e465b01ec0b1",
+#endif
+  };
+  if (!strcmp(block_name, "Hbd Intra32x32")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures32x32, 32, 32);
+  }
+  if (!strcmp(block_name, "Hbd Intra32x16")) {
+    TestHighbdIntraPred(block_name, pred_funcs, kSignatures32x16, 32, 16);
+  }
+}
+
+}  // namespace
+
+#define HIGHBD_INTRA_PRED_TEST(arch, test_func, block_size, dc, dc_left,     \
+                               dc_top, dc_128, v, h, d45e, d135, d117, d153, \
+                               d207e, d63e, tm, smooth, smooth_v, smooth_h)  \
+  TEST(arch, DISABLED_##test_func) {                                         \
+    static const AvxHighbdPredFunc aom_intra_pred[] = {                      \
+      dc,   dc_left, dc_top, dc_128, v,  h,      d45e,     d135,             \
+      d117, d153,    d207e,  d63e,   tm, smooth, smooth_v, smooth_h          \
+    };                                                                       \
+    test_func(block_size, aom_intra_pred);                                   \
+  }
+
+// -----------------------------------------------------------------------------
+// 4x4
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_4x4_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_4x4_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_1, TestHighbdIntraPred4, "Hbd Intra4x4", aom_highbd_dc_predictor_4x4_c,
+    aom_highbd_dc_left_predictor_4x4_c, aom_highbd_dc_top_predictor_4x4_c,
+    aom_highbd_dc_128_predictor_4x4_c, aom_highbd_v_predictor_4x4_c,
+    aom_highbd_h_predictor_4x4_c, aom_highbd_d45e_predictor_4x4_c,
+    aom_highbd_d135_predictor_4x4_c, aom_highbd_d117_predictor_4x4_c,
+    aom_highbd_d153_predictor_4x4_c, aom_highbd_d207e_predictor_4x4_c,
+    aom_highbd_d63e_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c,
+    aom_highbd_smooth_predictor_4x4_c, smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(
+    SSE2_1, TestHighbdIntraPred4, "Hbd Intra4x4",
+    aom_highbd_dc_predictor_4x4_sse2, aom_highbd_dc_left_predictor_4x4_sse2,
+    aom_highbd_dc_top_predictor_4x4_sse2, aom_highbd_dc_128_predictor_4x4_sse2,
+    aom_highbd_v_predictor_4x4_sse2, aom_highbd_h_predictor_4x4_sse2,
+    aom_highbd_d45e_predictor_4x4_sse2, aom_highbd_d135_predictor_4x4_sse2,
+    aom_highbd_d117_predictor_4x4_sse2, aom_highbd_d153_predictor_4x4_sse2,
+    NULL, NULL, NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred4, "Hbd Intra4x8",
+                       aom_highbd_dc_predictor_4x8_sse2,
+                       aom_highbd_dc_left_predictor_4x8_sse2,
+                       aom_highbd_dc_top_predictor_4x8_sse2,
+                       aom_highbd_dc_128_predictor_4x8_sse2,
+                       aom_highbd_v_predictor_4x8_sse2,
+                       aom_highbd_h_predictor_4x8_sse2,
+                       aom_highbd_d45e_predictor_4x8_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_4x8_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_4x8_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_2, TestHighbdIntraPred4, "Hbd Intra4x8", aom_highbd_dc_predictor_4x8_c,
+    aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c,
+    aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c,
+    aom_highbd_h_predictor_4x8_c, aom_highbd_d45e_predictor_4x8_c,
+    aom_highbd_d135_predictor_4x8_c, aom_highbd_d117_predictor_4x8_c,
+    aom_highbd_d153_predictor_4x8_c, aom_highbd_d207e_predictor_4x8_c,
+    aom_highbd_d63e_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c,
+    aom_highbd_smooth_predictor_4x8_c, smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+// -----------------------------------------------------------------------------
+// 8x8
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x8_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x8_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_1, TestHighbdIntraPred8, "Hbd Intra8x8", aom_highbd_dc_predictor_8x8_c,
+    aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c,
+    aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c,
+    aom_highbd_h_predictor_8x8_c, aom_highbd_d45e_predictor_8x8_c,
+    aom_highbd_d135_predictor_8x8_c, aom_highbd_d117_predictor_8x8_c,
+    aom_highbd_d153_predictor_8x8_c, aom_highbd_d207e_predictor_8x8_c,
+    aom_highbd_d63e_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c,
+    aom_highbd_smooth_predictor_8x8_c, smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred8, "Hbd Intra8x8",
+                       aom_highbd_dc_predictor_8x8_sse2,
+                       aom_highbd_dc_left_predictor_8x8_sse2,
+                       aom_highbd_dc_top_predictor_8x8_sse2,
+                       aom_highbd_dc_128_predictor_8x8_sse2,
+                       aom_highbd_v_predictor_8x8_sse2,
+                       aom_highbd_h_predictor_8x8_sse2,
+                       aom_highbd_d45e_predictor_8x8_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred8, "Hbd Intra8x4",
+                       aom_highbd_dc_predictor_8x4_sse2,
+                       aom_highbd_dc_left_predictor_8x4_sse2,
+                       aom_highbd_dc_top_predictor_8x4_sse2,
+                       aom_highbd_dc_128_predictor_8x4_sse2,
+                       aom_highbd_v_predictor_8x4_sse2,
+                       aom_highbd_h_predictor_8x4_sse2,
+                       aom_highbd_d45e_predictor_8x4_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred8, "Hbd Intra8x16",
+                       aom_highbd_dc_predictor_8x16_sse2,
+                       aom_highbd_dc_left_predictor_8x16_sse2,
+                       aom_highbd_dc_top_predictor_8x16_sse2,
+                       aom_highbd_dc_128_predictor_8x16_sse2,
+                       aom_highbd_v_predictor_8x16_sse2,
+                       aom_highbd_h_predictor_8x16_sse2,
+                       aom_highbd_d45e_predictor_8x16_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, "Hbd Intra8x8", NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d135_predictor_8x8_ssse3,
+                       aom_highbd_d117_predictor_8x8_ssse3,
+                       aom_highbd_d153_predictor_8x8_ssse3, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
+#endif
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x4_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x4_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_2, TestHighbdIntraPred8, "Hbd Intra8x4", aom_highbd_dc_predictor_8x4_c,
+    aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c,
+    aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c,
+    aom_highbd_h_predictor_8x4_c, aom_highbd_d45e_predictor_8x4_c,
+    aom_highbd_d135_predictor_8x4_c, aom_highbd_d117_predictor_8x4_c,
+    aom_highbd_d153_predictor_8x4_c, aom_highbd_d207e_predictor_8x4_c,
+    aom_highbd_d63e_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c,
+    aom_highbd_smooth_predictor_8x4_c, smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x16_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x16_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TestHighbdIntraPred8, "Hbd Intra8x16", aom_highbd_dc_predictor_8x16_c,
+    aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c,
+    aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c,
+    aom_highbd_h_predictor_8x16_c, aom_highbd_d45e_predictor_8x16_c,
+    aom_highbd_d135_predictor_8x16_c, aom_highbd_d117_predictor_8x16_c,
+    aom_highbd_d153_predictor_8x16_c, aom_highbd_d207e_predictor_8x16_c,
+    aom_highbd_d63e_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c,
+    aom_highbd_smooth_predictor_8x16_c, smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+// -----------------------------------------------------------------------------
+// 16x16
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_16x16_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_16x16_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_1, TestHighbdIntraPred16, "Hbd Intra16x16",
+    aom_highbd_dc_predictor_16x16_c, aom_highbd_dc_left_predictor_16x16_c,
+    aom_highbd_dc_top_predictor_16x16_c, aom_highbd_dc_128_predictor_16x16_c,
+    aom_highbd_v_predictor_16x16_c, aom_highbd_h_predictor_16x16_c,
+    aom_highbd_d45e_predictor_16x16_c, aom_highbd_d135_predictor_16x16_c,
+    aom_highbd_d117_predictor_16x16_c, aom_highbd_d153_predictor_16x16_c,
+    aom_highbd_d207e_predictor_16x16_c, aom_highbd_d63e_predictor_16x16_c,
+    aom_highbd_paeth_predictor_16x16_c, aom_highbd_smooth_predictor_16x16_c,
+    smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred16, "Hbd Intra16x16",
+                       aom_highbd_dc_predictor_16x16_sse2,
+                       aom_highbd_dc_left_predictor_16x16_sse2,
+                       aom_highbd_dc_top_predictor_16x16_sse2,
+                       aom_highbd_dc_128_predictor_16x16_sse2,
+                       aom_highbd_v_predictor_16x16_sse2,
+                       aom_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred16, "Hbd Intra16x8",
+                       aom_highbd_dc_predictor_16x8_sse2,
+                       aom_highbd_dc_left_predictor_16x8_sse2,
+                       aom_highbd_dc_top_predictor_16x8_sse2,
+                       aom_highbd_dc_128_predictor_16x8_sse2,
+                       aom_highbd_v_predictor_16x8_sse2,
+                       aom_highbd_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred16, "Hbd Intra16x32",
+                       aom_highbd_dc_predictor_16x32_sse2,
+                       aom_highbd_dc_left_predictor_16x32_sse2,
+                       aom_highbd_dc_top_predictor_16x32_sse2,
+                       aom_highbd_dc_128_predictor_16x32_sse2,
+                       aom_highbd_v_predictor_16x32_sse2,
+                       aom_highbd_h_predictor_16x32_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3_1, TestHighbdIntraPred16, "Hbd Intra16x16", NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d135_predictor_16x16_ssse3,
+                       aom_highbd_d117_predictor_16x16_ssse3,
+                       aom_highbd_d153_predictor_16x16_ssse3, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
+#endif
+
+#if HAVE_AVX2
+HIGHBD_INTRA_PRED_TEST(AVX2_1, TestHighbdIntraPred16, "Hbd Intra16x16", NULL,
+                       NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d45e_predictor_16x16_avx2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TestHighbdIntraPred16, "Hbd Intra16x8", NULL,
+                       NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d45e_predictor_16x8_avx2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_3, TestHighbdIntraPred16, "Hbd Intra16x32", NULL,
+                       NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d45e_predictor_16x32_avx2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_16x8_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_16x8_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_2, TestHighbdIntraPred16, "Hbd Intra16x8", aom_highbd_dc_predictor_16x8_c,
+    aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c,
+    aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c,
+    aom_highbd_h_predictor_16x8_c, aom_highbd_d45e_predictor_16x8_c,
+    aom_highbd_d135_predictor_16x8_c, aom_highbd_d117_predictor_16x8_c,
+    aom_highbd_d153_predictor_16x8_c, aom_highbd_d207e_predictor_16x8_c,
+    aom_highbd_d63e_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c,
+    aom_highbd_smooth_predictor_16x8_c, smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_16x32_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_16x32_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TestHighbdIntraPred16, "Hbd Intra16x32",
+    aom_highbd_dc_predictor_16x32_c, aom_highbd_dc_left_predictor_16x32_c,
+    aom_highbd_dc_top_predictor_16x32_c, aom_highbd_dc_128_predictor_16x32_c,
+    aom_highbd_v_predictor_16x32_c, aom_highbd_h_predictor_16x32_c,
+    aom_highbd_d45e_predictor_16x32_c, aom_highbd_d135_predictor_16x32_c,
+    aom_highbd_d117_predictor_16x32_c, aom_highbd_d153_predictor_16x32_c,
+    aom_highbd_d207e_predictor_16x32_c, aom_highbd_d63e_predictor_16x32_c,
+    aom_highbd_paeth_predictor_16x32_c, aom_highbd_smooth_predictor_16x32_c,
+    smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+// -----------------------------------------------------------------------------
+// 32x32
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_32x32_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_32x32_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_1, TestHighbdIntraPred32, "Hbd Intra32x32",
+    aom_highbd_dc_predictor_32x32_c, aom_highbd_dc_left_predictor_32x32_c,
+    aom_highbd_dc_top_predictor_32x32_c, aom_highbd_dc_128_predictor_32x32_c,
+    aom_highbd_v_predictor_32x32_c, aom_highbd_h_predictor_32x32_c,
+    aom_highbd_d45e_predictor_32x32_c, aom_highbd_d135_predictor_32x32_c,
+    aom_highbd_d117_predictor_32x32_c, aom_highbd_d153_predictor_32x32_c,
+    aom_highbd_d207e_predictor_32x32_c, aom_highbd_d63e_predictor_32x32_c,
+    aom_highbd_paeth_predictor_32x32_c, aom_highbd_smooth_predictor_32x32_c,
+    smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred32, "Hbd Intra32x32",
+                       aom_highbd_dc_predictor_32x32_sse2,
+                       aom_highbd_dc_left_predictor_32x32_sse2,
+                       aom_highbd_dc_top_predictor_32x32_sse2,
+                       aom_highbd_dc_128_predictor_32x32_sse2,
+                       aom_highbd_v_predictor_32x32_sse2,
+                       aom_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred32, "Hbd Intra32x16",
+                       aom_highbd_dc_predictor_32x16_sse2,
+                       aom_highbd_dc_left_predictor_32x16_sse2,
+                       aom_highbd_dc_top_predictor_32x16_sse2,
+                       aom_highbd_dc_128_predictor_32x16_sse2,
+                       aom_highbd_v_predictor_32x16_sse2,
+                       aom_highbd_h_predictor_32x16_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3_1, TestHighbdIntraPred32, "Hbd Intra32x32", NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d135_predictor_32x32_ssse3,
+                       aom_highbd_d117_predictor_32x32_ssse3,
+                       aom_highbd_d153_predictor_32x32_ssse3, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
+#endif
+
+#if HAVE_AVX2
+HIGHBD_INTRA_PRED_TEST(AVX2_1, TestHighbdIntraPred32, "Hbd Intra32x32", NULL,
+                       NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d45e_predictor_32x32_avx2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TestHighbdIntraPred32, "Hbd Intra32x16", NULL,
+                       NULL, NULL, NULL, NULL, NULL,
+                       aom_highbd_d45e_predictor_32x16_avx2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_highbd_smooth_v_predictor_32x16_c
+#define smooth_h_pred_func aom_highbd_smooth_h_predictor_32x16_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+HIGHBD_INTRA_PRED_TEST(
+    C_2, TestHighbdIntraPred32, "Hbd Intra32x16",
+    aom_highbd_dc_predictor_32x16_c, aom_highbd_dc_left_predictor_32x16_c,
+    aom_highbd_dc_top_predictor_32x16_c, aom_highbd_dc_128_predictor_32x16_c,
+    aom_highbd_v_predictor_32x16_c, aom_highbd_h_predictor_32x16_c,
+    aom_highbd_d45e_predictor_32x16_c, aom_highbd_d135_predictor_32x16_c,
+    aom_highbd_d117_predictor_32x16_c, aom_highbd_d153_predictor_32x16_c,
+    aom_highbd_d207e_predictor_32x16_c, aom_highbd_d63e_predictor_32x16_c,
+    aom_highbd_paeth_predictor_32x16_c, aom_highbd_smooth_predictor_32x16_c,
+    smooth_v_pred_func, smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+#endif  // CONFIG_HIGHBITDEPTH
+
 #include "test/test_libaom.cc"
diff --git a/third_party/aom/test/warp_filter_test_util.cc b/third_party/aom/test/warp_filter_test_util.cc
index ab3c153e7..47ce6c371 100644
--- a/third_party/aom/test/warp_filter_test_util.cc
+++ b/third_party/aom/test/warp_filter_test_util.cc
@@ -21,9 +21,9 @@ namespace AV1WarpFilter {
 ::testing::internal::ParamGenerator<WarpTestParam> BuildParams(
     warp_affine_func filter) {
   const WarpTestParam params[] = {
-    make_tuple(4, 4, 50000, filter),  make_tuple(8, 8, 50000, filter),
-    make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter),
-    make_tuple(32, 8, 10000, filter),
+    make_tuple(4, 4, 100, filter),   make_tuple(8, 8, 100, filter),
+    make_tuple(64, 64, 100, filter), make_tuple(4, 16, 100, filter),
+    make_tuple(32, 8, 100, filter),
   };
   return ::testing::ValuesIn(params);
 }
@@ -113,34 +113,76 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
   ConvolveParams conv_params = get_conv_params(0, 0, 0);
-
-  // Generate an input block and extend its borders horizontally
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) input[i * stride + j] = rnd_.Rand8();
-  for (i = 0; i < h; ++i) {
-    memset(input + i * stride - border, input[i * stride], border);
-    memset(input + i * stride + w, input[i * stride + (w - 1)], border);
-  }
+#if CONFIG_CONVOLVE_ROUND
+  int32_t *dsta = new int32_t[output_n];
+  int32_t *dstb = new int32_t[output_n];
+#endif
 
   for (i = 0; i < num_iters; ++i) {
+    // Generate an input block and extend its borders horizontally
+    for (int r = 0; r < h; ++r)
+      for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
+    for (int r = 0; r < h; ++r) {
+      memset(input + r * stride - border, input[r * stride], border);
+      memset(input + r * stride + w, input[r * stride + (w - 1)], border);
+    }
+#if CONFIG_CONVOLVE_ROUND
+    const int use_no_round = rnd_.Rand8() & 1;
+#endif
     for (sub_x = 0; sub_x < 2; ++sub_x)
       for (sub_y = 0; sub_y < 2; ++sub_y) {
         generate_model(mat, &alpha, &beta, &gamma, &delta);
+#if CONFIG_CONVOLVE_ROUND
+        if (use_no_round) {
+          // Prepare two copies of the destination
+          for (j = 0; j < out_w * out_h; ++j) {
+            int32_t v = rnd_.Rand16();
+            dsta[j] = v;
+            dstb[j] = v;
+          }
+          conv_params = get_conv_params_no_round(0, 0, 0, dsta, out_w);
+        } else {
+          conv_params = get_conv_params(0, 0, 0);
+        }
+#endif
         av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
                           out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta,
                           gamma, delta);
+#if CONFIG_CONVOLVE_ROUND
+        if (use_no_round) {
+          conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w);
+        }
+#endif
         test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
                   out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
 
+#if CONFIG_CONVOLVE_ROUND
+        if (use_no_round) {
+          for (j = 0; j < out_w * out_h; ++j)
+            ASSERT_EQ(dsta[j], dstb[j])
+                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
+                << ", " << (j / out_w) << ") on iteration " << i;
+        } else {
+          for (j = 0; j < out_w * out_h; ++j)
+            ASSERT_EQ(output[j], output2[j])
+                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
+                << ", " << (j / out_w) << ") on iteration " << i;
+        }
+#else
         for (j = 0; j < out_w * out_h; ++j)
           ASSERT_EQ(output[j], output2[j])
               << "Pixel mismatch at index " << j << " = (" << (j % out_w)
               << ", " << (j / out_w) << ") on iteration " << i;
+#endif
       }
   }
   delete[] input_;
   delete[] output;
   delete[] output2;
+#if CONFIG_CONVOLVE_ROUND
+  delete[] dsta;
+  delete[] dstb;
+#endif
 }
 }  // namespace AV1WarpFilter
 
@@ -149,14 +191,14 @@ namespace AV1HighbdWarpFilter {
 
 ::testing::internal::ParamGenerator<HighbdWarpTestParam> GetDefaultParams() {
   const HighbdWarpTestParam defaultParams[] = {
-    make_tuple(4, 4, 50000, 8),   make_tuple(8, 8, 50000, 8),
-    make_tuple(64, 64, 1000, 8),  make_tuple(4, 16, 20000, 8),
-    make_tuple(32, 8, 10000, 8),  make_tuple(4, 4, 50000, 10),
-    make_tuple(8, 8, 50000, 10),  make_tuple(64, 64, 1000, 10),
-    make_tuple(4, 16, 20000, 10), make_tuple(32, 8, 10000, 10),
-    make_tuple(4, 4, 50000, 12),  make_tuple(8, 8, 50000, 12),
-    make_tuple(64, 64, 1000, 12), make_tuple(4, 16, 20000, 12),
-    make_tuple(32, 8, 10000, 12),
+    make_tuple(4, 4, 100, 8),    make_tuple(8, 8, 100, 8),
+    make_tuple(64, 64, 100, 8),  make_tuple(4, 16, 100, 8),
+    make_tuple(32, 8, 100, 8),   make_tuple(4, 4, 100, 10),
+    make_tuple(8, 8, 100, 10),   make_tuple(64, 64, 100, 10),
+    make_tuple(4, 16, 100, 10),  make_tuple(32, 8, 100, 10),
+    make_tuple(4, 4, 100, 12),   make_tuple(8, 8, 100, 12),
+    make_tuple(64, 64, 100, 12), make_tuple(4, 16, 100, 12),
+    make_tuple(32, 8, 100, 12),
   };
   return ::testing::ValuesIn(defaultParams);
 }
@@ -250,39 +292,82 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
   ConvolveParams conv_params = get_conv_params(0, 0, 0);
-
-  // Generate an input block and extend its borders horizontally
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) input[i * stride + j] = rnd_.Rand16() & mask;
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < border; ++j) {
-      input[i * stride - border + j] = input[i * stride];
-      input[i * stride + w + j] = input[i * stride + (w - 1)];
-    }
-  }
+#if CONFIG_CONVOLVE_ROUND
+  int32_t *dsta = new int32_t[output_n];
+  int32_t *dstb = new int32_t[output_n];
+#endif
 
   for (i = 0; i < num_iters; ++i) {
+    // Generate an input block and extend its borders horizontally
+    for (int r = 0; r < h; ++r)
+      for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < border; ++c) {
+        input[r * stride - border + c] = input[r * stride];
+        input[r * stride + w + c] = input[r * stride + (w - 1)];
+      }
+    }
+#if CONFIG_CONVOLVE_ROUND
+    const int use_no_round = rnd_.Rand8() & 1;
+#endif
     for (sub_x = 0; sub_x < 2; ++sub_x)
       for (sub_y = 0; sub_y < 2; ++sub_y) {
         generate_model(mat, &alpha, &beta, &gamma, &delta);
-
+#if CONFIG_CONVOLVE_ROUND
+        if (use_no_round) {
+          // Prepare two copies of the destination
+          for (j = 0; j < out_w * out_h; ++j) {
+            int32_t v = rnd_.Rand16();
+            dsta[j] = v;
+            dstb[j] = v;
+          }
+          conv_params = get_conv_params_no_round(0, 0, 0, dsta, out_w);
+        } else {
+          conv_params = get_conv_params(0, 0, 0);
+        }
+#endif
         av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
                                  out_w, out_h, out_w, sub_x, sub_y, bd,
                                  &conv_params, alpha, beta, gamma, delta);
+#if CONFIG_CONVOLVE_ROUND
+        if (use_no_round) {
+          // TODO(angiebird): Change this to test_impl once we have SIMD
+          // implementation
+          conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w);
+        }
+#endif
         test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
                   out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma,
                   delta);
 
+#if CONFIG_CONVOLVE_ROUND
+        if (use_no_round) {
+          for (j = 0; j < out_w * out_h; ++j)
+            ASSERT_EQ(dsta[j], dstb[j])
+                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
+                << ", " << (j / out_w) << ") on iteration " << i;
+        } else {
+          for (j = 0; j < out_w * out_h; ++j)
+            ASSERT_EQ(output[j], output2[j])
+                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
+                << ", " << (j / out_w) << ") on iteration " << i;
+        }
+#else
         for (j = 0; j < out_w * out_h; ++j)
           ASSERT_EQ(output[j], output2[j])
               << "Pixel mismatch at index " << j << " = (" << (j % out_w)
               << ", " << (j / out_w) << ") on iteration " << i;
+#endif
       }
   }
 
   delete[] input_;
   delete[] output;
   delete[] output2;
+#if CONFIG_CONVOLVE_ROUND
+  delete[] dsta;
+  delete[] dstb;
+#endif
 }
 }  // namespace AV1HighbdWarpFilter
 #endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/test/webm_video_source.h b/third_party/aom/test/webm_video_source.h
index 286f69cbf..b6c998042 100644
--- a/third_party/aom/test/webm_video_source.h
+++ b/third_party/aom/test/webm_video_source.h
@@ -41,8 +41,8 @@ class WebMVideoSource : public CompressedVideoSource {
 
   virtual void Begin() {
     aom_ctx_->file = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(aom_ctx_->file != NULL) << "Input file open failed. Filename: "
-                                        << file_name_;
+    ASSERT_TRUE(aom_ctx_->file != NULL)
+        << "Input file open failed. Filename: " << file_name_;
 
     ASSERT_EQ(file_is_webm(webm_ctx_, aom_ctx_), 1) << "file is not WebM";
 
diff --git a/third_party/aom/test/y4m_video_source.h b/third_party/aom/test/y4m_video_source.h
index 2279d7970..f70c30be6 100644
--- a/third_party/aom/test/y4m_video_source.h
+++ b/third_party/aom/test/y4m_video_source.h
@@ -35,8 +35,8 @@ class Y4mVideoSource : public VideoSource {
   virtual void OpenSource() {
     CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-                                     << file_name_;
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
   }
 
   virtual void ReadSourceToStart() {
diff --git a/third_party/aom/test/yuv_video_source.h b/third_party/aom/test/yuv_video_source.h
index 9ff76a8d8..88cabd5bb 100644
--- a/third_party/aom/test/yuv_video_source.h
+++ b/third_party/aom/test/yuv_video_source.h
@@ -44,8 +44,8 @@ class YUVVideoSource : public VideoSource {
   virtual void Begin() {
     if (input_file_) fclose(input_file_);
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-                                     << file_name_;
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
     if (start_)
       fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);