23 files changed, 18450 insertions, 0 deletions
diff --git a/media/libvpx/vp9/common/x86/convolve.h b/media/libvpx/vp9/common/x86/convolve.h
new file mode 100644
index 000000000..de2df47e5
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/convolve.h
@@ -0,0 +1,296 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_X86_CONVOLVE_H_
+#define VP9_COMMON_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+typedef void filter8_1dfunction (
+  const uint8_t *src_ptr,
+  ptrdiff_t src_pitch,
+  uint8_t *output_ptr,
+  ptrdiff_t out_pitch,
+  uint32_t output_height,
+  const int16_t *filter
+);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                    uint8_t *dst, ptrdiff_t dst_stride, \
+                                    const int16_t *filter_x, int x_step_q4, \
+                                    const int16_t *filter_y, int y_step_q4, \
+                                    int w, int h) { \
+  if (step_q4 == 16 && filter[3] != 128) { \
+    if (filter[0] || filter[1] || filter[2]) { \
+      while (w >= 16) { \
+        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                 src_stride, \
+                                                 dst, \
+                                                 dst_stride, \
+                                                 h, \
+                                                 filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { \
+      while (w >= 16) { \
+        vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                 src_stride, \
+                                                 dst, \
+                                                 dst_stride, \
+                                                 h, \
+                                                 filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+  } \
+  if (w) { \
+    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+                             filter_x, x_step_q4, filter_y, y_step_q4, \
+                             w, h); \
+  } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, int x_step_q4, \
+                              const int16_t *filter_y, int y_step_q4, \
+                              int w, int h) { \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+                                filter_x, x_step_q4, filter_y, y_step_q4, \
+                                w, h + 7); \
+      vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+                                      filter_x, x_step_q4, filter_y, \
+                                      y_step_q4, w, h); \
+    } else { \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+      vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+                                filter_x, x_step_q4, filter_y, y_step_q4, \
+                                w, h + 1); \
+      vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+                                      filter_x, x_step_q4, filter_y, \
+                                      y_step_q4, w, h); \
+    } \
+  } else { \
+    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+  } \
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction (
+  const uint16_t *src_ptr,
+  const ptrdiff_t src_pitch,
+  uint16_t *output_ptr,
+  ptrdiff_t out_pitch,
+  unsigned int output_height,
+  const int16_t *filter,
+  int bd
+);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+                                           ptrdiff_t src_stride, \
+                                           uint8_t *dst8, \
+                                           ptrdiff_t dst_stride, \
+                                           const int16_t *filter_x, \
+                                           int x_step_q4, \
+                                           const int16_t *filter_y, \
+                                           int y_step_q4, \
+                                           int w, int h, int bd) { \
+  if (step_q4 == 16 && filter[3] != 128) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    if (filter[0] || filter[1] || filter[2]) { \
+      while (w >= 16) { \
+        vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { \
+      while (w >= 16) { \
+        vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+  } \
+  if (w) { \
+    vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, y_step_q4, \
+                                    w, h, bd); \
+  } \
+}
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                     uint8_t *dst, ptrdiff_t dst_stride, \
+                                     const int16_t *filter_x, int x_step_q4, \
+                                     const int16_t *filter_y, int y_step_q4, \
+                                     int w, int h, int bd) { \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+      vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 7, bd); \
+      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+                                             64, dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } else { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+      vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 1, bd); \
+      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                             dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } \
+  } else { \
+    vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                                  filter_x, x_step_q4, filter_y, y_step_q4, w, \
+                                  h, bd); \
+  } \
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VP9_COMMON_X86_CONVOLVE_H_
diff --git a/media/libvpx/vp9/common/x86/vp9_asm_stubs.c b/media/libvpx/vp9/common/x86/vp9_asm_stubs.c
new file mode 100644
index 000000000..fd55fb8c6
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -0,0 +1,162 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction vp9_filter_block1d16_v8_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_sse2;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
+
+filter8_1dfunction vp9_filter_block1d16_v2_sse2;
+filter8_1dfunction vp9_filter_block1d16_h2_sse2;
+filter8_1dfunction vp9_filter_block1d8_v2_sse2;
+filter8_1dfunction vp9_filter_block1d8_h2_sse2;
+filter8_1dfunction vp9_filter_block1d4_v2_sse2;
+filter8_1dfunction vp9_filter_block1d4_h2_sse2;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
+
+// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                  uint8_t *dst, ptrdiff_t dst_stride,
+//                                  const int16_t *filter_x, int x_step_q4,
+//                                  const int16_t *filter_y, int y_step_q4,
+//                                  int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
+
+// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                         uint8_t *dst, ptrdiff_t dst_stride,
+//                         const int16_t *filter_x, int x_step_q4,
+//                         const int16_t *filter_y, int y_step_q4,
+//                         int w, int h);
+// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                             uint8_t *dst, ptrdiff_t dst_stride,
+//                             const int16_t *filter_x, int x_step_q4,
+//                             const int16_t *filter_y, int y_step_q4,
+//                             int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_ , sse2);
+
+#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2;
+
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
+
+// void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src,
+//                                      ptrdiff_t src_stride,
+//                                      uint8_t *dst,
+//                                      ptrdiff_t dst_stride,
+//                                      const int16_t *filter_x,
+//                                      int x_step_q4,
+//                                      const int16_t *filter_y,
+//                                      int y_step_q4,
+//                                      int w, int h, int bd);
+// void vp9_highbd_convolve8_vert_sse2(const uint8_t *src,
+//                                     ptrdiff_t src_stride,
+//                                     uint8_t *dst,
+//                                     ptrdiff_t dst_stride,
+//                                     const int16_t *filter_x,
+//                                     int x_step_q4,
+//                                     const int16_t *filter_y,
+//                                     int y_step_q4,
+//                                     int w, int h, int bd);
+// void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+//                                          ptrdiff_t src_stride,
+//                                          uint8_t *dst,
+//                                          ptrdiff_t dst_stride,
+//                                          const int16_t *filter_x,
+//                                          int x_step_q4,
+//                                          const int16_t *filter_y,
+//                                          int y_step_q4,
+//                                          int w, int h, int bd);
+// void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+//                                         ptrdiff_t src_stride,
+//                                         uint8_t *dst,
+//                                         ptrdiff_t dst_stride,
+//                                         const int16_t *filter_x,
+//                                         int x_step_q4,
+//                                         const int16_t *filter_y,
+//                                         int y_step_q4,
+//                                         int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+                 sse2);
+
+// void vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h, int bd);
+// void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2);
+HIGH_FUN_CONV_2D(avg_ , sse2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+#endif  // HAVE_SSE2
diff --git a/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm
new file mode 100644
index 000000000..b26383708
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm
@@ -0,0 +1,156 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1
+INIT_XMM sse2
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+                              fx, fxs, fy, fys, w, h
+  mov r4d, dword wm
+  cmp r4d, 4
+  je .w4
+  cmp r4d, 8
+  je .w8
+  cmp r4d, 16
+  je .w16
+  cmp r4d, 32
+  je .w32
+
+  mov                    r4d, dword hm
+.loop64:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+16]
+  pavgb                   m2, [dstq+32]
+  pavgb                   m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  add                   dstq, dst_strideq
+  dec                    r4d
+  jnz .loop64
+  RET
+
+.w32:
+  mov                    r4d, dword hm
+.loop32:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+src_strideq]
+  movu                    m3, [srcq+src_strideq+16]
+  lea                   srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq            +16]
+  pavgb                   m2, [dstq+dst_strideq]
+  pavgb                   m3, [dstq+dst_strideq+16]
+%endif
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq+dst_strideq   ], m2
+  mova [dstq+dst_strideq+16], m3
+  lea                   dstq, [dstq+dst_strideq*2]
+  sub                    r4d, 2
+  jnz .loop32
+  RET
+
+.w16:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop16:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+src_strideq]
+  movu                    m2, [srcq+src_strideq*2]
+  movu                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+dst_strideq]
+  pavgb                   m2, [dstq+dst_strideq*2]
+  pavgb                   m3, [dstq+r6q]
+%endif
+  mova  [dstq              ], m0
+  mova  [dstq+dst_strideq  ], m1
+  mova  [dstq+dst_strideq*2], m2
+  mova  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop16
+  RET
+
+INIT_MMX sse
+.w8:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop8:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+src_strideq]
+  movu                    m2, [srcq+src_strideq*2]
+  movu                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+dst_strideq]
+  pavgb                   m2, [dstq+dst_strideq*2]
+  pavgb                   m3, [dstq+r6q]
+%endif
+  mova  [dstq              ], m0
+  mova  [dstq+dst_strideq  ], m1
+  mova  [dstq+dst_strideq*2], m2
+  mova  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop8
+  RET
+
+.w4:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop4:
+  movh                    m0, [srcq]
+  movh                    m1, [srcq+src_strideq]
+  movh                    m2, [srcq+src_strideq*2]
+  movh                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  movh                    m4, [dstq]
+  movh                    m5, [dstq+dst_strideq]
+  movh                    m6, [dstq+dst_strideq*2]
+  movh                    m7, [dstq+r6q]
+  pavgb                   m0, m4
+  pavgb                   m1, m5
+  pavgb                   m2, m6
+  pavgb                   m3, m7
+%endif
+  movh  [dstq              ], m0
+  movh  [dstq+dst_strideq  ], m1
+  movh  [dstq+dst_strideq*2], m2
+  movh  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop4
+  RET
+%endmacro
+
+convolve_fn copy
+convolve_fn avg
diff --git a/media/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm b/media/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
new file mode 100644
index 000000000..b12d29c0a
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
@@ -0,0 +1,476 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4:  times 8 dw 4
+pw_8:  times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
+
+SECTION .text
+INIT_MMX sse
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  movq                  m0, [aboveq]
+  movq                  m2, [leftq]
+  DEFINE_ARGS dst, stride, one
+  mov                 oned, 0x0001
+  pxor                  m1, m1
+  movd                  m3, oned
+  pshufw                m3, m3, 0x0
+  paddw                 m0, m2
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  paddw                 m0, [GLOBAL(pw_4)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, one
+  mov                 oned, 0x00010001
+  lea             stride3q, [strideq*3]
+  movd                  m3, oned
+  pshufd                m3, m3, 0x0
+  paddw                 m0, m2
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  paddw                 m0, [GLOBAL(pw_8)]
+  psrlw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m3, [aboveq+16]
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  movhlps               m2, m0
+  paddw                 m0, m2
+  punpcklwd             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  paddd                 m0, [GLOBAL(pw_16)]
+  psrad                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+.loop:
+  mova   [dstq              ], m0
+  mova   [dstq           +16], m0
+  mova   [dstq+strideq*2    ], m0
+  mova   [dstq+strideq*2 +16], m0
+  mova   [dstq+strideq*4    ], m0
+  mova   [dstq+strideq*4 +16], m0
+  mova   [dstq+stride3q*2   ], m0
+  mova   [dstq+stride3q*2+16], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [aboveq+32]
+  mova                  m4, [aboveq+48]
+  mova                  m5, [leftq]
+  mova                  m6, [leftq+16]
+  mova                  m7, [leftq+32]
+  mova                  m8, [leftq+48]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  paddw                 m0, m5
+  paddw                 m0, m6
+  paddw                 m0, m7
+  paddw                 m0, m8
+  movhlps               m2, m0
+  paddw                 m0, m2
+  punpcklwd             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  paddd                 m0, [GLOBAL(pw_32)]
+  psrad                 m0, 6
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+.loop:
+  mova [dstq               ], m0
+  mova [dstq          +16  ], m0
+  mova [dstq          +32  ], m0
+  mova [dstq          +48  ], m0
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2+16  ], m0
+  mova [dstq+strideq*2+32  ], m0
+  mova [dstq+strideq*2+48  ], m0
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4+16  ], m0
+  mova [dstq+strideq*4+32  ], m0
+  mova [dstq+strideq*4+48  ], m0
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m0
+  mova [dstq+stride3q*2 +32], m0
+  mova [dstq+stride3q*2 +48], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+%endif
+
+INIT_MMX sse
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movq                  m0, [aboveq]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 4
+.loop:
+  mova    [dstq              ], m0
+  mova    [dstq           +16], m1
+  mova    [dstq+strideq*2    ], m0
+  mova    [dstq+strideq*2 +16], m1
+  mova    [dstq+strideq*4    ], m0
+  mova    [dstq+strideq*4 +16], m1
+  mova    [dstq+stride3q*2   ], m0
+  mova    [dstq+stride3q*2+16], m1
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  mova                  m2, [aboveq+32]
+  mova                  m3, [aboveq+48]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 8
+.loop:
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq            +32], m2
+  mova [dstq            +48], m3
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2  +16], m1
+  mova [dstq+strideq*2  +32], m2
+  mova [dstq+strideq*2  +48], m3
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4  +16], m1
+  mova [dstq+strideq*4  +32], m2
+  mova [dstq+strideq*4  +48], m3
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m1
+  mova [dstq+stride3q*2 +32], m2
+  mova [dstq+stride3q*2 +48], m3
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_MMX sse
+cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+  movd                  m1, [aboveq-2]
+  movq                  m0, [aboveq]
+  pshufw                m1, m1, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  movd                  m3, oned
+  movd                  m4, bpsd
+  pshufw                m3, m3, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -2
+  mova                  m2, m3
+  psllw                 m3, m4
+  add                leftq, 8
+  psubw                 m3, m2 ; max possible value
+  pxor                  m4, m4 ; min possible value
+  psubw                 m0, m1
+.loop:
+  movq                  m1, [leftq+lineq*4]
+  movq                  m2, [leftq+lineq*4+2]
+  pshufw                m1, m1, 0x0
+  pshufw                m2, m2, 0x0
+  paddw                 m1, m0
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m1, m3
+  pminsw                m2, m3
+  pmaxsw                m1, m4
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m1
+  movq    [dstq+strideq*2], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+  movd                  m1, [aboveq-2]
+  mova                  m0, [aboveq]
+  pshuflw               m1, m1, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                  m3, m3
+  pxor                  m4, m4
+  pinsrw                m3, oned, 0
+  pinsrw                m4, bpsd, 0
+  pshuflw               m3, m3, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq            m3, m3
+  mov                lineq, -4
+  mova                  m2, m3
+  punpcklqdq            m1, m1
+  psllw                 m3, m4
+  add                leftq, 16
+  psubw                 m3, m2 ; max possible value
+  pxor                  m4, m4 ; min possible value
+  psubw                 m0, m1
+.loop:
+  movd                  m1, [leftq+lineq*4]
+  movd                  m2, [leftq+lineq*4+2]
+  pshuflw               m1, m1, 0x0
+  pshuflw               m2, m2, 0x0
+  punpcklqdq            m1, m1
+  punpcklqdq            m2, m2
+  paddw                 m1, m0
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m1, m3
+  pminsw                m2, m3
+  pmaxsw                m1, m4
+  pmaxsw                m2, m4
+  ;Store the values
+  mova      [dstq          ], m1
+  mova      [dstq+strideq*2], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
+  movd                  m2, [aboveq-2]
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  pshuflw               m2, m2, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                  m7, m7
+  pxor                  m8, m8
+  pinsrw                m7, oned, 0
+  pinsrw                m8, bpsd, 0
+  pshuflw               m7, m7, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq            m7, m7
+  mov                lineq, -8
+  mova                  m5, m7
+  punpcklqdq            m2, m2
+  psllw                 m7, m8
+  add                leftq, 32
+  psubw                 m7, m5 ; max possible value
+  pxor                  m8, m8 ; min possible value
+  psubw                 m0, m2
+  psubw                 m1, m2
+.loop:
+  movd                  m2, [leftq+lineq*4]
+  movd                  m3, [leftq+lineq*4+2]
+  pshuflw               m2, m2, 0x0
+  pshuflw               m3, m3, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m3, m3
+  paddw                 m4, m2, m0
+  paddw                 m5, m3, m0
+  paddw                 m2, m1
+  paddw                 m3, m1
+  ;Clamp to the bit-depth
+  pminsw                m4, m7
+  pminsw                m5, m7
+  pminsw                m2, m7
+  pminsw                m3, m7
+  pmaxsw                m4, m8
+  pmaxsw                m5, m8
+  pmaxsw                m2, m8
+  pmaxsw                m3, m8
+  ;Store the values
+  mova   [dstq             ], m4
+  mova   [dstq+strideq*2   ], m5
+  mova   [dstq          +16], m2
+  mova   [dstq+strideq*2+16], m3
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+  movd                  m0, [aboveq-2]
+  mova                  m1, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [aboveq+32]
+  mova                  m4, [aboveq+48]
+  pshuflw               m0, m0, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                 m10, m10
+  pxor                 m11, m11
+  pinsrw               m10, oned, 0
+  pinsrw               m11, bpsd, 0
+  pshuflw              m10, m10, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq           m10, m10
+  mov                lineq, -16
+  mova                  m5, m10
+  punpcklqdq            m0, m0
+  psllw                m10, m11
+  add                leftq, 64
+  psubw                m10, m5 ; max possible value
+  pxor                 m11, m11 ; min possible value
+  psubw                 m1, m0
+  psubw                 m2, m0
+  psubw                 m3, m0
+  psubw                 m4, m0
+.loop:
+  movd                  m5, [leftq+lineq*4]
+  movd                  m6, [leftq+lineq*4+2]
+  pshuflw               m5, m5, 0x0
+  pshuflw               m6, m6, 0x0
+  punpcklqdq            m5, m5
+  punpcklqdq            m6, m6
+  paddw                 m7, m5, m1
+  paddw                 m8, m5, m2
+  paddw                 m9, m5, m3
+  paddw                 m5, m4
+  ;Clamp these values to the bit-depth
+  pminsw                m7, m10
+  pminsw                m8, m10
+  pminsw                m9, m10
+  pminsw                m5, m10
+  pmaxsw                m7, m11
+  pmaxsw                m8, m11
+  pmaxsw                m9, m11
+  pmaxsw                m5, m11
+  ;Store these values
+  mova   [dstq           ], m7
+  mova   [dstq        +16], m8
+  mova   [dstq        +32], m9
+  mova   [dstq        +48], m5
+  paddw                 m7, m6, m1
+  paddw                 m8, m6, m2
+  paddw                 m9, m6, m3
+  paddw                 m6, m4
+  ;Clamp these values to the bit-depth
+  pminsw                m7, m10
+  pminsw                m8, m10
+  pminsw                m9, m10
+  pminsw                m6, m10
+  pmaxsw                m7, m11
+  pmaxsw                m8, m11
+  pmaxsw                m9, m11
+  pmaxsw                m6, m11
+  ;Store these values
+  mova   [dstq+strideq*2   ], m7
+  mova   [dstq+strideq*2+16], m8
+  mova   [dstq+strideq*2+32], m9
+  mova   [dstq+strideq*2+48], m6
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+%endif
diff --git a/media/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
new file mode 100644
index 000000000..b40669c63
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
@@ -0,0 +1,1215 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vp9_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+  __m128i ubounded;
+  __m128i lbounded;
+  __m128i retval;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i t80, max, min;
+
+  if (bd == 8) {
+    t80 = _mm_set1_epi16(0x80);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+  } else if (bd == 10) {
+    t80 = _mm_set1_epi16(0x200);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+  } else {  // bd == 12
+    t80 = _mm_set1_epi16(0x800);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+  }
+
+  min = _mm_subs_epi16(zero, t80);
+
+  ubounded = _mm_cmpgt_epi16(value, max);
+  lbounded = _mm_cmplt_epi16(value, min);
+  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+  ubounded = _mm_and_si128(ubounded, max);
+  lbounded = _mm_and_si128(lbounded, min);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_or_si128(retval, lbounded);
+  return retval;
+}
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
+static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
+                                                   int p,
+                                                   const uint8_t *_blimit,
+                                                   const uint8_t *_limit,
+                                                   const uint8_t *_thresh,
+                                                   int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i blimit, limit, thresh;
+  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+  __m128i ps1, qs1, ps0, qs0;
+  __m128i abs_p0q0, abs_p1q1, ffff, work;
+  __m128i filt, work_a, filter1, filter2;
+  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+  __m128i flat2_q0, flat2_p0;
+  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+  __m128i pixelFilter_p, pixelFilter_q;
+  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+  __m128i t4, t3, t80, t1;
+  __m128i eight, four;
+
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+  }
+
+  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
+  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+
+  //  highbd_filter_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // abs(p1 - q1) / 2
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
+                                    _mm_subs_epu16(p0, p1)),
+                       _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                    _mm_subs_epu16(q0, q1)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
+
+  // lp filter
+  // highbd_filter4
+  t4 = _mm_set1_epi16(4);
+  t3 = _mm_set1_epi16(3);
+  if (bd == 8)
+    t80 = _mm_set1_epi16(0x80);
+  else if (bd == 10)
+    t80 = _mm_set1_epi16(0x200);
+  else  // bd == 12
+    t80 = _mm_set1_epi16(0x800);
+
+  t1 = _mm_set1_epi16(0x1);
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+
+  filt = _mm_and_si128(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+  filt = _mm_and_si128(filt, mask);
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3
+  filter1 = _mm_srai_epi16(filter1, 0x3);
+  filter2 = _mm_srai_epi16(filter2, 0x3);
+
+  qs0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
+      t80);
+  ps0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
+      t80);
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(hev, filt);
+  qs1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+      t80);
+  ps1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+      t80);
+
+  // end highbd_filter4
+  // loopfilter done
+
+  // highbd_flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  flat = _mm_max_epi16(work, flat);
+
+  if (bd == 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  // end flat_mask4
+
+  // flat & mask = flat && mask (as used in filter8)
+  // (because, in both vars, each block of 16 either all 1s or all 0s)
+  flat = _mm_and_si128(flat, mask);
+
+  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+
+  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
+  // but referred to as p0-p4 & q0-q4 in fn)
+  flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
+                                     _mm_subs_epu16(p0, p4)),
+                        _mm_or_si128(_mm_subs_epu16(q4, q0),
+                                     _mm_subs_epu16(q0, q4)));
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
+                                    _mm_subs_epu16(p0, p5)),
+                       _mm_or_si128(_mm_subs_epu16(q5, q0),
+                                    _mm_subs_epu16(q0, q5)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
+                                    _mm_subs_epu16(p0, p6)),
+                       _mm_or_si128(_mm_subs_epu16(q6, q0),
+                                    _mm_subs_epu16(q0, q6)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
+                                    _mm_subs_epu16(p0, p7)),
+                       _mm_or_si128(_mm_subs_epu16(q7, q0),
+                                    _mm_subs_epu16(q0, q7)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  if (bd == 8)
+    flat2 = _mm_subs_epu16(flat2, one);
+  else if (bd == 10)
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
+  flat2 = _mm_cmpeq_epi16(flat2, zero);
+  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  // end highbd_flat_mask5
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
+  eight = _mm_set1_epi16(8);
+  four = _mm_set1_epi16(4);
+
+  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
+                                _mm_add_epi16(p4, p3));
+  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
+                                _mm_add_epi16(q4, q3));
+
+  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+  pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                      pixelFilter_q));
+  pixetFilter_p2p1p0 =   _mm_add_epi16(four,
+                                       _mm_add_epi16(pixetFilter_p2p1p0,
+                                                     pixetFilter_q2q1q0));
+  flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(p7, p0)), 4);
+  flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(q7, q0)), 4);
+  flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(p3, p0)), 3);
+  flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(q3, q0)), 3);
+
+  sum_p7 = _mm_add_epi16(p7, p7);
+  sum_q7 = _mm_add_epi16(q7, q7);
+  sum_p3 = _mm_add_epi16(p3, p3);
+  sum_q3 = _mm_add_epi16(q3, q3);
+
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+  flat2_p1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+  flat2_q1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+  flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p1)), 3);
+  flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q1)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  sum_p3 = _mm_add_epi16(sum_p3, p3);
+  sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+  flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p2)), 4);
+  flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q2)), 4);
+
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+  flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p2)), 3);
+  flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q2)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+  flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p3)), 4);
+  flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q3)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+  flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p4)), 4);
+  flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q4)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+  flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p5)), 4);
+  flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q5)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+  flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p6)), 4);
+  flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q6)), 4);
+
+  //  wide flat
+  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  //  highbd_filter8
+  p2 = _mm_andnot_si128(flat, p2);
+  //  p2 remains unchanged if !(flat && mask)
+  flat_p2 = _mm_and_si128(flat, flat_p2);
+  //  when (flat && mask)
+  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
+  q2 = _mm_andnot_si128(flat, q2);
+  flat_q2 = _mm_and_si128(flat, flat_q2);
+  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values
+
+  ps1 = _mm_andnot_si128(flat, ps1);
+  //  p1 takes the value assigned to in in filter4 if !(flat && mask)
+  flat_p1 = _mm_and_si128(flat, flat_p1);
+  //  when (flat && mask)
+  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
+  qs1 = _mm_andnot_si128(flat, qs1);
+  flat_q1 = _mm_and_si128(flat, flat_q1);
+  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values
+
+  ps0 = _mm_andnot_si128(flat, ps0);
+  //  p0 takes the value assigned to in in filter4 if !(flat && mask)
+  flat_p0 = _mm_and_si128(flat, flat_p0);
+  //  when (flat && mask)
+  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
+  qs0 = _mm_andnot_si128(flat, qs0);
+  flat_q0 = _mm_and_si128(flat, flat_q0);
+  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
+  // end highbd_filter8
+
+  // highbd_filter16
+  p6 = _mm_andnot_si128(flat2, p6);
+  //  p6 remains unchanged if !(flat2 && flat && mask)
+  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+  //  get values for when (flat2 && flat && mask)
+  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
+  q6 = _mm_andnot_si128(flat2, q6);
+  //  q6 remains unchanged if !(flat2 && flat && mask)
+  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+  //  get values for when (flat2 && flat && mask)
+  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
+  _mm_store_si128((__m128i *)(s - 7 * p), p6);
+  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+
+  p5 = _mm_andnot_si128(flat2, p5);
+  //  p5 remains unchanged if !(flat2 && flat && mask)
+  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+  //  get values for when (flat2 && flat && mask)
+  p5 = _mm_or_si128(p5, flat2_p5);
+  //  full list of p5 values
+  q5 = _mm_andnot_si128(flat2, q5);
+  //  q5 remains unchanged if !(flat2 && flat && mask)
+  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+  //  get values for when (flat2 && flat && mask)
+  q5 = _mm_or_si128(q5, flat2_q5);
+  //  full list of q5 values
+  _mm_store_si128((__m128i *)(s - 6 * p), p5);
+  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+
+  p4 = _mm_andnot_si128(flat2, p4);
+  //  p4 remains unchanged if !(flat2 && flat && mask)
+  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+  //  get values for when (flat2 && flat && mask)
+  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
+  q4 = _mm_andnot_si128(flat2, q4);
+  //  q4 remains unchanged if !(flat2 && flat && mask)
+  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+  //  get values for when (flat2 && flat && mask)
+  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
+  _mm_store_si128((__m128i *)(s - 5 * p), p4);
+  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+
+  p3 = _mm_andnot_si128(flat2, p3);
+  //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+  //  get values for when (flat2 && flat && mask)
+  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
+  q3 = _mm_andnot_si128(flat2, q3);
+  //  q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+  //  get values for when (flat2 && flat && mask)
+  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
+  _mm_store_si128((__m128i *)(s - 4 * p), p3);
+  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+
+  p2 = _mm_andnot_si128(flat2, p2);
+  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+  //  get values for when (flat2 && flat && mask)
+  p2 = _mm_or_si128(p2, flat2_p2);
+  //  full list of p2 values
+  q2 = _mm_andnot_si128(flat2, q2);
+  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+  //  get values for when (flat2 && flat && mask)
+  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+
+  p1 = _mm_andnot_si128(flat2, p1);
+  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+  //  get values for when (flat2 && flat && mask)
+  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
+  q1 = _mm_andnot_si128(flat2, q1);
+  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+  //  get values for when (flat2 && flat && mask)
+  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+
+  p0 = _mm_andnot_si128(flat2, p0);
+  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+  //  get values for when (flat2 && flat && mask)
+  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
+  q0 = _mm_andnot_si128(flat2, q0);
+  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+  //  get values for when (flat2 && flat && mask)
+  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s - 0 * p), q0);
+}
+
+static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
+                                                    int p,
+                                                    const uint8_t *_blimit,
+                                                    const uint8_t *_limit,
+                                                    const uint8_t *_thresh,
+                                                    int bd) {
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,
+                                         bd);
+}
+
+// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
+void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+                                       const uint8_t *_blimit,
+                                       const uint8_t *_limit,
+                                       const uint8_t *_thresh,
+                                       int count, int bd) {
+  if (count == 1)
+    highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  else
+    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+}
+
+void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i workp_a, workp_b, workp_shft;
+
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  __m128i t80;
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  __m128i ps1, ps0, qs0, qs1;
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_set1_epi16(0x200);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_set1_epi16(0x800);
+  }
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+
+  // filter_mask and hev_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                          _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                          _mm_subs_epu16(q0, q1));
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                          _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                          _mm_subs_epu16(q1, p1));
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(abs_p1p0, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  mask = _mm_max_epi16(abs_q1q0, mask);
+  // mask |= (abs(q1 - q0) > limit) * -1;
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_p1p0, flat);
+  flat = _mm_max_epi16(abs_q1q0, flat);
+
+  if (bd == 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+
+  // lp filter
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  filt = signed_char_clamp_bd_sse2(filt, bd);
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = _mm_adds_epi16(filt, t4);
+  filter2 = _mm_adds_epi16(filt, t3);
+
+  // Filter1 >> 3
+  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+  filter1 = _mm_srai_epi16(filter1, 3);
+
+  // Filter2 >> 3
+  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+  filter2 = _mm_srai_epi16(filter2, 3);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+  filt = _mm_andnot_si128(hev, filt);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q0 = _mm_load_si128((__m128i *)flat_oq0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q0 = _mm_and_si128(flat, q0);
+  q0 = _mm_or_si128(work_a, q0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q1 = _mm_load_si128((__m128i *)flat_oq1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q1 = _mm_and_si128(flat, q1);
+  q1 = _mm_or_si128(work_a, q1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q2 = _mm_load_si128((__m128i *)flat_oq2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q2 = _mm_and_si128(flat, q2);
+  q2 = _mm_or_si128(work_a, q2);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p0 = _mm_load_si128((__m128i *)flat_op0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p0 = _mm_and_si128(flat, p0);
+  p0 = _mm_or_si128(work_a, p0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p1 = _mm_load_si128((__m128i *)flat_op1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p1 = _mm_and_si128(flat, p1);
+  p1 = _mm_or_si128(work_a, p1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p2 = _mm_load_si128((__m128i *)flat_op2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p2 = _mm_and_si128(flat, p2);
+  p2 = _mm_or_si128(work_a, p2);
+
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s + 0 * p), q0);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+}
+
+void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+                                   1, bd);
+}
+
+void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                                        _mm_subs_epu16(p0, p1));
+  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                        _mm_subs_epu16(q0, q1));
+  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                                  _mm_subs_epu16(q0, p0));
+  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                                  _mm_subs_epu16(q1, p1));
+  __m128i work;
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  __m128i t80;
+  __m128i tff80;
+  __m128i tffe0;
+  __m128i t1f;
+  // equivalent to shifting 0x1f left by bitdepth - 8
+  // and setting new bits to 1
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  __m128i t7f;
+  // equivalent to shifting 0x7f left by bitdepth - 8
+  // and setting new bits to 1
+  __m128i ps1, ps0, qs0, qs1;
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+    tff80 = _mm_set1_epi16(0xff80);
+    tffe0 = _mm_set1_epi16(0xffe0);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+  }
+
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+
+  // filter_mask and hev_mask
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(flat, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  // mask |= (abs(q1 - q0) > limit) * -1;
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // filter4
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
+  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
+  filter1 = _mm_srli_epi16(filter1, 3);
+  work_a = _mm_and_si128(work_a, tffe0);  // sign bits for the values < 0
+  filter1 = _mm_and_si128(filter1, t1f);  // clamp the range
+  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
+
+  // Filter2 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter2);
+  filter2 = _mm_srli_epi16(filter2, 3);
+  work_a = _mm_and_si128(work_a, tffe0);
+  filter2 = _mm_and_si128(filter2, t1f);
+  filter2 = _mm_or_si128(filter2, work_a);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  work_a = _mm_cmpgt_epi16(zero, filt);
+  filt = _mm_srli_epi16(filt, 1);
+  work_a = _mm_and_si128(work_a, tff80);
+  filt = _mm_and_si128(filt, t7f);
+  filt = _mm_or_si128(filt, work_a);
+
+  filt = _mm_andnot_si128(hev, filt);
+
+  q0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+  q1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
+  p0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+  p1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+                                   bd);
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p,
+                                    uint16_t *dst[], int out_p,
+                                    int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    uint16_t *in = src[idx8x8];
+    uint16_t *out = dst[idx8x8];
+
+    p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13
+    x0 = _mm_unpacklo_epi16(p0, p1);
+    // 20 30 21 31 22 32 23 33
+    x1 = _mm_unpacklo_epi16(p2, p3);
+    // 40 50 41 51 42 52 43 53
+    x2 = _mm_unpacklo_epi16(p4, p5);
+    // 60 70 61 71 62 72 63 73
+    x3 = _mm_unpacklo_epi16(p6, p7);
+    // 00 10 20 30 01 11 21 31
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 40 50 60 70 41 51 61 71
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 00 10 20 30 40 50 60 70
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 01 11 21 31 41 51 61 71
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 0*out_p), x6);
+    // 00 10 20 30 40 50 60 70
+    _mm_storeu_si128((__m128i *)(out + 1*out_p), x7);
+    // 01 11 21 31 41 51 61 71
+
+    // 02 12 22 32 03 13 23 33
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 42 52 62 72 43 53 63 73
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 02 12 22 32 42 52 62 72
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 2*out_p), x6);
+    // 02 12 22 32 42 52 62 72
+    _mm_storeu_si128((__m128i *)(out + 3*out_p), x7);
+    // 03 13 23 33 43 53 63 73
+
+    // 04 14 05 15 06 16 07 17
+    x0 = _mm_unpackhi_epi16(p0, p1);
+    // 24 34 25 35 26 36 27 37
+    x1 = _mm_unpackhi_epi16(p2, p3);
+    // 44 54 45 55 46 56 47 57
+    x2 = _mm_unpackhi_epi16(p4, p5);
+    // 64 74 65 75 66 76 67 77
+    x3 = _mm_unpackhi_epi16(p6, p7);
+    // 04 14 24 34 05 15 25 35
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 44 54 64 74 45 55 65 75
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 04 14 24 34 44 54 64 74
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 05 15 25 35 45 55 65 75
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 4*out_p), x6);
+    // 04 14 24 34 44 54 64 74
+    _mm_storeu_si128((__m128i *)(out + 5*out_p), x7);
+    // 05 15 25 35 45 55 65 75
+
+    // 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 06 16 26 36 46 56 66 76
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 6*out_p), x6);
+    // 06 16 26 36 46 56 66 76
+    _mm_storeu_si128((__m128i *)(out + 7*out_p), x7);
+    // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
+                                        int in_p, uint16_t *out, int out_p) {
+  uint16_t *src0[1];
+  uint16_t *src1[1];
+  uint16_t *dest0[1];
+  uint16_t *dest1[1];
+  src0[0] = in0;
+  src1[0] = in1;
+  dest0[0] = out;
+  dest1[0] = out + 8;
+  highbd_transpose(src0, in_p, dest0, out_p, 1);
+  highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh,
+                                     int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  src[0] = s - 8;
+  src[1] = s;
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;
+
+  // Transpose 16x8
+  highbd_transpose(src, p, dst, 8, 2);
+
+  // Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
+                                         thresh, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8 * 8;
+  dst[0] = s - 8;
+  dst[1] = s;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+                                          int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
+
+  //  Transpose 16x16
+  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+  //  Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                          thresh, bd);
+
+  //  Transpose back
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
diff --git a/media/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm b/media/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..29ec151ed
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
@@ -0,0 +1,962 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    psrldq      xmm7, 8
+    pshuflw     xmm4, xmm7, 0b              ;k4
+    pshuflw     xmm5, xmm7, 01010101b       ;k5
+    pshuflw     xmm6, xmm7, 10101010b       ;k6
+    pshuflw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklwd   xmm0, xmm6
+    punpcklwd   xmm2, xmm5
+    punpcklwd   xmm3, xmm4
+    punpcklwd   xmm1, xmm7
+
+    movdqa      k0k6, xmm0
+    movdqa      k2k5, xmm2
+    movdqa      k3k4, xmm3
+    movdqa      k1k7, xmm1
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6
+
+    ;Compute max and min values of a pixel
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)      ;bps
+    movq        xmm0, rdx
+    movq        xmm1, rcx
+    pshufd      xmm0, xmm0, 0b
+    movdqa      xmm2, xmm0
+    psllw       xmm0, xmm1
+    psubw       xmm0, xmm2
+    pxor        xmm1, xmm1
+    movdqa      max, xmm0                  ;max value (for clamping)
+    movdqa      min, xmm1                  ;min value (for clamping)
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+    punpcklwd   xmm0, xmm6                  ;two row in one register
+    punpcklwd   xmm1, xmm7
+    punpcklwd   xmm2, xmm5
+    punpcklwd   xmm3, xmm4
+
+    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
+    pmaddwd     xmm1, k1k7
+    pmaddwd     xmm2, k2k5
+    pmaddwd     xmm3, k3k4
+
+    paddd       xmm0, xmm1                  ;sum
+    paddd       xmm0, xmm2
+    paddd       xmm0, xmm3
+
+    paddd       xmm0, krd                   ;rounding
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm0                  ;pack to word
+
+    ;clamp the values
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgw       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    pshufhw     xmm4, xmm7, 0b              ;k4
+    pshufhw     xmm5, xmm7, 01010101b       ;k5
+    pshufhw     xmm6, xmm7, 10101010b       ;k6
+    pshufhw     xmm7, xmm7, 11111111b       ;k7
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+    punpcklwd   xmm0, xmm1
+    punpckhwd   xmm6, xmm7
+    punpckhwd   xmm2, xmm5
+    punpckhwd   xmm3, xmm4
+
+    movdqa      k0k1, xmm0                  ;store filter factors on stack
+    movdqa      k6k7, xmm6
+    movdqa      k2k5, xmm2
+    movdqa      k3k4, xmm3
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6                   ;rounding
+
+    ;Compute max and min values of a pixel
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm0, rdx
+    movq        xmm1, rcx
+    pshufd      xmm0, xmm0, 0b
+    movdqa      xmm2, xmm0
+    psllw       xmm0, xmm1
+    psubw       xmm0, xmm2
+    pxor        xmm1, xmm1
+    movdqa      max, xmm0                  ;max value (for clamping)
+    movdqa      min, xmm1                  ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+    movdqu      xmm0, [rsi + %1]            ;0
+    movdqu      xmm1, [rsi + rax + %1]      ;1
+    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea         rsi,  [rsi + rax]
+    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
+    movdqu      xmm2, [rsi + rax + %1]      ;2
+    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
+    movdqu      xmm4, [rsi + rdx + %1]      ;4
+    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+    movdqu      temp, xmm4
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm1, xmm6
+    punpcklwd   xmm6, xmm7
+    punpckhwd   xmm1, xmm7
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm2, xmm5
+    punpckhwd   xmm7, xmm5
+
+    movdqu      xmm5, temp
+    movdqu      temp, xmm4
+    movdqa      xmm4, xmm3
+    punpcklwd   xmm3, xmm5
+    punpckhwd   xmm4, xmm5
+    movdqu      xmm5, temp
+
+    pmaddwd     xmm0, k0k1
+    pmaddwd     xmm5, k0k1
+    pmaddwd     xmm6, k6k7
+    pmaddwd     xmm1, k6k7
+    pmaddwd     xmm2, k2k5
+    pmaddwd     xmm7, k2k5
+    pmaddwd     xmm3, k3k4
+    pmaddwd     xmm4, k3k4
+
+    paddd       xmm0, xmm6
+    paddd       xmm0, xmm2
+    paddd       xmm0, xmm3
+    paddd       xmm5, xmm1
+    paddd       xmm5, xmm7
+    paddd       xmm5, xmm4
+
+    paddd       xmm0, krd                   ;rounding
+    paddd       xmm5, krd
+    psrad       xmm0, 7                     ;shift
+    psrad       xmm5, 7
+    packssdw    xmm0, xmm5                  ;pack back to word
+
+    ;clamp the values
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
+
+%if %1
+    movdqu      xmm1, [rdi + %2]
+    pavgw       xmm0, xmm1
+%endif
+    movdqu      [rdi + %2], xmm0
+%endm
+
+;void vp9_filter_block1d4_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movq        xmm0, [rsi]                 ;load src: row 0
+    movq        xmm1, [rsi + rax]           ;1
+    movq        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movq        xmm7, [rsi + rdx * 2]       ;7
+    movq        xmm2, [rsi + rax]           ;2
+    movq        xmm3, [rsi + rax * 2]       ;3
+    movq        xmm4, [rsi + rdx]           ;4
+    movq        xmm5, [rsi + rax * 4]       ;5
+
+    HIGH_APPLY_FILTER_4 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 0, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 0, 0
+    sub         rsi, rax
+
+    LOAD_VERT_8 16
+    HIGH_APPLY_FILTER_8 0, 16
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movq        xmm0, [rsi]                 ;load src: row 0
+    movq        xmm1, [rsi + rax]           ;1
+    movq        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movq        xmm7, [rsi + rdx * 2]       ;7
+    movq        xmm2, [rsi + rax]           ;2
+    movq        xmm3, [rsi + rax * 2]       ;3
+    movq        xmm4, [rsi + rdx]           ;4
+    movq        xmm5, [rsi + rax * 4]       ;5
+
+    HIGH_APPLY_FILTER_4 1
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 1, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 1, 0
+    sub         rsi, rax
+
+    LOAD_VERT_8 16
+    HIGH_APPLY_FILTER_8 1, 16
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d4_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm4,   [rsi + 2]
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm4
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm4
+
+    psrldq      xmm1, 2
+    psrldq      xmm6, 4
+    psrldq      xmm7, 6
+    psrldq      xmm2, 4
+    psrldq      xmm3, 6
+    psrldq      xmm5, 2
+
+    HIGH_APPLY_FILTER_4 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 0, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 0, 0
+
+    movdqu      xmm0,   [rsi + 10]           ;load src
+    movdqu      xmm1,   [rsi + 12]
+    movdqu      xmm2,   [rsi + 14]
+    movdqu      xmm3,   [rsi + 16]
+    movdqu      xmm4,   [rsi + 18]
+    movdqu      xmm5,   [rsi + 20]
+    movdqu      xmm6,   [rsi + 22]
+    movdqu      xmm7,   [rsi + 24]
+
+    HIGH_APPLY_FILTER_8 0, 16
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm4,   [rsi + 2]
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm4
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm4
+
+    psrldq      xmm1, 2
+    psrldq      xmm6, 4
+    psrldq      xmm7, 6
+    psrldq      xmm2, 4
+    psrldq      xmm3, 6
+    psrldq      xmm5, 2
+
+    HIGH_APPLY_FILTER_4 1
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 1, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 1, 0
+
+    movdqu      xmm0,   [rsi + 10]           ;load src
+    movdqu      xmm1,   [rsi + 12]
+    movdqu      xmm2,   [rsi + 14]
+    movdqu      xmm3,   [rsi + 16]
+    movdqu      xmm4,   [rsi + 18]
+    movdqu      xmm5,   [rsi + 20]
+    movdqu      xmm6,   [rsi + 22]
+    movdqu      xmm7,   [rsi + 24]
+
+    HIGH_APPLY_FILTER_8 1, 16
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/media/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm b/media/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 000000000..93784121c
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,494 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm3, [rdx]                 ;load filters
+    pshuflw     xmm4, xmm3, 11111111b       ;k3
+    psrldq      xmm3, 8
+    pshuflw     xmm3, xmm3, 0b              ;k4
+    punpcklwd   xmm4, xmm3                  ;k3k4
+
+    movq        xmm3, rcx                   ;rounding
+    pshufd      xmm3, xmm3, 0
+
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm5, rdx
+    movq        xmm2, rcx
+    pshufd      xmm5, xmm5, 0b
+    movdqa      xmm1, xmm5
+    psllw       xmm5, xmm2
+    psubw       xmm5, xmm1                  ;max value (for clamping)
+    pxor        xmm2, xmm2                  ;min value (for clamping)
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+    punpcklwd   xmm0, xmm1                  ;two row in one register
+    pmaddwd     xmm0, xmm4                  ;multiply the filter factors
+
+    paddd       xmm0, xmm3                  ;rounding
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm0                  ;pack to word
+
+    ;clamp the values
+    pminsw      xmm0, xmm5
+    pmaxsw      xmm0, xmm2
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgw       xmm0, xmm1
+%endif
+
+    movq        [rdi], xmm0
+    lea         rsi, [rsi + 2*rax]
+    lea         rdi, [rdi + 2*rdx]
+    dec         rcx
+%endm
+
+%if ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm6, [rdx]                 ;load filters
+
+    pshuflw     xmm7, xmm6, 11111111b       ;k3
+    pshufhw     xmm6, xmm6, 0b              ;k4
+    psrldq      xmm6, 8
+    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4
+
+    movq        xmm4, rcx                   ;rounding
+    pshufd      xmm4, xmm4, 0
+
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm8, rdx
+    movq        xmm5, rcx
+    pshufd      xmm8, xmm8, 0b
+    movdqa      xmm1, xmm8
+    psllw       xmm8, xmm5
+    psubw       xmm8, xmm1                  ;max value (for clamping)
+    pxor        xmm5, xmm5                  ;min value (for clamping)
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+    movdqa      xmm6, xmm0
+    punpckhwd   xmm6, xmm1
+    punpcklwd   xmm0, xmm1
+    pmaddwd     xmm6, xmm7
+    pmaddwd     xmm0, xmm7
+
+    paddd       xmm6, xmm4                  ;rounding
+    paddd       xmm0, xmm4                  ;rounding
+    psrad       xmm6, 7                     ;shift
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm6                  ;pack back to word
+
+    ;clamp the values
+    pminsw      xmm0, xmm8
+    pmaxsw      xmm0, xmm5
+
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgw       xmm0, xmm1
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + 2*rax]
+    lea         rdi, [rdi + 2*rdx]
+    dec         rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+    movdqa      xmm9, xmm0
+    movdqa      xmm6, xmm2
+    punpckhwd   xmm9, xmm1
+    punpckhwd   xmm6, xmm3
+    punpcklwd   xmm0, xmm1
+    punpcklwd   xmm2, xmm3
+
+    pmaddwd     xmm9, xmm7
+    pmaddwd     xmm6, xmm7
+    pmaddwd     xmm0, xmm7
+    pmaddwd     xmm2, xmm7
+
+    paddd       xmm9, xmm4                  ;rounding
+    paddd       xmm6, xmm4
+    paddd       xmm0, xmm4
+    paddd       xmm2, xmm4
+
+    psrad       xmm9, 7                     ;shift
+    psrad       xmm6, 7
+    psrad       xmm0, 7
+    psrad       xmm2, 7
+
+    packssdw    xmm0, xmm9                  ;pack back to word
+    packssdw    xmm2, xmm6                  ;pack back to word
+
+    ;clamp the values
+    pminsw      xmm0, xmm8
+    pmaxsw      xmm0, xmm5
+    pminsw      xmm2, xmm8
+    pmaxsw      xmm2, xmm5
+
+%if %1
+    movdqu      xmm1, [rdi]
+    movdqu      xmm3, [rdi + 16]
+    pavgw       xmm0, xmm1
+    pavgw       xmm2, xmm3
+%endif
+    movdqu      [rdi], xmm0               ;store the result
+    movdqu      [rdi + 16], xmm2          ;store the result
+
+    lea         rsi, [rsi + 2*rax]
+    lea         rdi, [rdi + 2*rdx]
+    dec         rcx
+%endm
+%endif
+
+global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movq        xmm0, [rsi]                 ;load src
+    movq        xmm1, [rsi + 2*rax]
+
+    HIGH_APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + 2*rax]         ;1
+
+    HIGH_APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm2, [rsi + 16]
+    movdqu        xmm1, [rsi + 2*rax]       ;1
+    movdqu        xmm3, [rsi + 2*rax + 16]
+
+    HIGH_APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movq        xmm0, [rsi]                 ;load src
+    movq        xmm1, [rsi + 2*rax]
+
+    HIGH_APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + 2*rax]         ;1
+
+    HIGH_APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + 2*rax]       ;1
+    movdqu        xmm2, [rsi + 16]
+    movdqu        xmm3, [rsi + 2*rax + 16]
+
+    HIGH_APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 2
+
+    HIGH_APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 2]
+
+    HIGH_APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 2]
+    movdqu      xmm2,   [rsi + 16]
+    movdqu      xmm3,   [rsi + 18]
+
+    HIGH_APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 2
+
+    HIGH_APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 2]
+
+    HIGH_APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 2]
+    movdqu      xmm2,   [rsi + 16]
+    movdqu      xmm3,   [rsi + 18]
+
+    HIGH_APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
diff --git a/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
new file mode 100644
index 000000000..ce010df3b
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -0,0 +1,4223 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+#include "vp9/common/vp9_idct.h"
+
+#define RECON_AND_STORE4X4(dest, in_x) \
+{                                                     \
+  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+  d0 = _mm_unpacklo_epi8(d0, zero); \
+  d0 = _mm_add_epi16(in_x, d0); \
+  d0 = _mm_packus_epi16(d0, d0); \
+  *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+}
+
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i cst = _mm_setr_epi16(
+      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i input0, input1, input2, input3;
+
+  // Rows
+  input0 = _mm_load_si128((const __m128i *)input);
+  input2 = _mm_load_si128((const __m128i *)(input + 8));
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);
+  input0 = _mm_shufflehi_epi16(input0, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+  input1 = _mm_unpackhi_epi32(input0, input0);
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input3 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input1);
+  input1 = _mm_packs_epi32(input2, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch column2, column 3, and then, we got:
+  // input2: column1, column 0;  input3: column2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_unpacklo_epi32(input2, input2);
+  input1 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpackhi_epi32(input3, input3);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input2);
+  input1 = _mm_packs_epi32(input1, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch column2, column 3, and then, we got:
+  // input2: column1, column 0;  input3: column2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Reconstruction and Store
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+    d0 = _mm_unpacklo_epi8(d0, zero);
+    d2 = _mm_unpacklo_epi8(d2, zero);
+    d0 = _mm_add_epi16(d0, input2);
+    d2 = _mm_add_epi16(d2, input3);
+    d0 = _mm_packus_epi16(d0, d2);
+    // store input0
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store input1
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store input2
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+    // store input3
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+  }
+}
+
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 4);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+static void idct4_sse2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+
+  transpose_4x4(in);
+  // stage 1
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[3], v[2]);
+
+  // stage 2
+  in[0] = _mm_add_epi16(u[0], u[1]);
+  in[1] = _mm_sub_epi16(u[0], u[1]);
+  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+static void iadst4_sse2(__m128i *in) {
+  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8], in7;
+
+  transpose_4x4(in);
+  in7 = _mm_srli_si128(in[1], 8);
+  in7 = _mm_add_epi16(in7, in[0]);
+  in7 = _mm_sub_epi16(in7, in[1]);
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);
+  u[3] = _mm_unpackhi_epi16(in[0], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_add_epi32(v[3], v[4]);
+  u[2] = v[2];
+  u[3] = _mm_add_epi32(u[0], u[1]);
+  u[4] = _mm_slli_epi32(v[5], 2);
+  u[5] = _mm_add_epi32(u[3], v[5]);
+  u[6] = _mm_sub_epi32(u[5], u[4]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
+void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
+  __m128i in[2];
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+
+  in[0] = _mm_loadu_si128((const __m128i *)(input));
+  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
+
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      idct4_sse2(in);
+      idct4_sse2(in);
+      break;
+    case 1:  // ADST_DCT
+      idct4_sse2(in);
+      iadst4_sse2(in);
+      break;
+    case 2:  // DCT_ADST
+      iadst4_sse2(in);
+      idct4_sse2(in);
+      break;
+    case 3:  // ADST_ADST
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // Final round and shift
+  in[0] = _mm_add_epi16(in[0], eight);
+  in[1] = _mm_add_epi16(in[1], eight);
+
+  in[0] = _mm_srai_epi16(in[0], 4);
+  in[1] = _mm_srai_epi16(in[1], 4);
+
+  // Reconstruction and Store
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+    d0 = _mm_unpacklo_epi8(d0, zero);
+    d2 = _mm_unpacklo_epi8(d2, zero);
+    d0 = _mm_add_epi16(d0, in[0]);
+    d2 = _mm_add_epi16(d2, in[1]);
+    d0 = _mm_packus_epi16(d0, d2);
+    // store result[0]
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store result[1]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store result[2]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+    // store result[3]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+  }
+}
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+                      out0, out1, out2, out3, out4, out5, out6, out7) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+                                                        \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+                                                            \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+  }
+
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
+                         out0, out1, out2, out3) \
+  {                                              \
+    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+    \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+  }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+  {                                            \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+  }
+
+// Define Macro for multiplying elements by constants and adding them together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      tmp4 = _mm_madd_epi16(lo_1, cst2); \
+      tmp5 = _mm_madd_epi16(hi_1, cst2); \
+      tmp6 = _mm_madd_epi16(lo_1, cst3); \
+      tmp7 = _mm_madd_epi16(hi_1, cst3); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      tmp4 = _mm_add_epi32(tmp4, rounding); \
+      tmp5 = _mm_add_epi32(tmp5, rounding); \
+      tmp6 = _mm_add_epi32(tmp6, rounding); \
+      tmp7 = _mm_add_epi32(tmp7, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+      res2 = _mm_packs_epi32(tmp4, tmp5); \
+      res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+  }
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
+                 out0, out1, out2, out3, out4, out5, out6, out7)  \
+  { \
+  /* Stage1 */      \
+  { \
+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+    \
+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+                          stg1_1, stg1_2, stg1_3, stp1_4,      \
+                          stp1_7, stp1_5, stp1_6)              \
+  } \
+    \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+                           stg2_1, stg2_2, stg2_3, stp2_0,     \
+                           stp2_1, stp2_2, stp2_3)             \
+    \
+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  } \
+  \
+  /* Stage4  */ \
+  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+  }
+
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+
+  // 2-D
+  for (i = 0; i < 2; i++) {
+    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
+    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
+                  in0, in1, in2, in3, in4, in5, in6, in7);
+
+    // 4-stage 1D idct8x8
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+          in0, in1, in2, in3, in4, in5, in6, in7);
+  }
+
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE(dest + 0 * stride, dc_value);
+  RECON_AND_STORE(dest + 1 * stride, dc_value);
+  RECON_AND_STORE(dest + 2 * stride, dc_value);
+  RECON_AND_STORE(dest + 3 * stride, dc_value);
+  RECON_AND_STORE(dest + 4 * stride, dc_value);
+  RECON_AND_STORE(dest + 5 * stride, dc_value);
+  RECON_AND_STORE(dest + 6 * stride, dc_value);
+  RECON_AND_STORE(dest + 7 * stride, dc_value);
+}
+
+static void idct8_sse2(__m128i *in) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
+  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+                in0, in1, in2, in3, in4, in5, in6, in7);
+
+  // 4-stage 1D idct8x8
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+}
+
+static void iadst8_sse2(__m128i *in) {
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // transpose
+  array_transpose_8x8(in, in);
+
+  // properly aligned for butterfly input
+  in0 = in[7];
+  in1 = in[0];
+  in2 = in[5];
+  in3 = in[2];
+  in4 = in[3];
+  in5 = in[4];
+  in6 = in[1];
+  in7 = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // addition
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit intergers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
+  __m128i in[8];
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+  // load input data
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      idct8_sse2(in);
+      idct8_sse2(in);
+      break;
+    case 1:  // ADST_DCT
+      idct8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    case 2:  // DCT_ADST
+      iadst8_sse2(in);
+      idct8_sse2(in);
+      break;
+    case 3:  // ADST_ADST
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // Final rounding and shift
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 5);
+  in[1] = _mm_srai_epi16(in[1], 5);
+  in[2] = _mm_srai_epi16(in[2], 5);
+  in[3] = _mm_srai_epi16(in[3], 5);
+  in[4] = _mm_srai_epi16(in[4], 5);
+  in[5] = _mm_srai_epi16(in[5], 5);
+  in[6] = _mm_srai_epi16(in[6], 5);
+  in[7] = _mm_srai_epi16(in[7], 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in[0]);
+  RECON_AND_STORE(dest + 1 * stride, in[1]);
+  RECON_AND_STORE(dest + 2 * stride, in[2]);
+  RECON_AND_STORE(dest + 3 * stride, in[3]);
+  RECON_AND_STORE(dest + 4 * stride, in[4]);
+  RECON_AND_STORE(dest + 5 * stride, in[5]);
+  RECON_AND_STORE(dest + 6 * stride, in[6]);
+  RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
+
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // Rows. Load 4-row input data.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+
+  // 8x4 Transpose
+  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+  // Stage1
+  {
+    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+  }
+
+  // Stage2
+  {
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+    stp2_4 = tmp0;
+    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
+        in0, in1, in2, in3, in4, in5, in6, in7);
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+#define IDCT16 \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
+    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
+    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+                           stg2_0, stg2_1, stg2_2, stg2_3, \
+                           stp2_8, stp2_15, stp2_9, stp2_14) \
+    \
+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+                           stg2_4, stg2_5, stg2_6, stg2_7, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+                           stg3_0, stg3_1, stg3_2, stg3_3, \
+                           stp1_4, stp1_7, stp1_5, stp1_6) \
+    \
+    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+    \
+    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  } \
+  \
+  /* Stage4 */ \
+  { \
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+    \
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    \
+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+                           stg4_0, stg4_1, stg4_2, stg4_3, \
+                           stp2_0, stp2_1, stp2_2, stp2_3) \
+    \
+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                           stg4_4, stg4_5, stg4_6, stg4_7, \
+                           stp2_9, stp2_14, stp2_10, stp2_13) \
+  } \
+    \
+  /* Stage5 */ \
+  { \
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+    \
+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+    \
+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+  } \
+    \
+  /* Stage6 */ \
+  { \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+    \
+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+    \
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                           stg6_0, stg4_0, stg6_0, stg4_0, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  }
+
+#define IDCT16_10 \
+    /* Stage2 */ \
+    { \
+      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+      \
+      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+                             stg2_0, stg2_1, stg2_6, stg2_7, \
+                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+    } \
+      \
+    /* Stage3 */ \
+    { \
+      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+                               stg3_0, stg3_1,  \
+                               stp2_4, stp2_7) \
+      \
+      stp1_9  =  stp1_8_0; \
+      stp1_10 =  stp1_11;  \
+      \
+      stp1_13 = stp1_12_0; \
+      stp1_14 = stp1_15;   \
+    } \
+    \
+    /* Stage4 */ \
+    { \
+      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+      \
+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+                               stg4_0, stg4_1, \
+                               stp1_0, stp1_1) \
+      stp2_5 = stp2_4; \
+      stp2_6 = stp2_7; \
+      \
+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                             stg4_4, stg4_5, stg4_6, stg4_7, \
+                             stp2_9, stp2_14, stp2_10, stp2_13) \
+    } \
+      \
+    /* Stage5 */ \
+    { \
+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+      \
+      stp1_2 = stp1_1; \
+      stp1_3 = stp1_0; \
+      \
+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+      \
+      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+      \
+      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+    } \
+      \
+    /* Stage6 */ \
+    { \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+      \
+      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+      \
+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                             stg6_0, stg4_0, stg6_0, stg4_0, \
+                             stp2_10, stp2_13, stp2_11, stp2_12) \
+    }
+
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+                                int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[16], l[16], r[16], *curr1;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  curr1 = l;
+  for (i = 0; i < 2; i++) {
+    // 1-D idct
+
+    // Load input data.
+    in[0] = _mm_load_si128((const __m128i *)input);
+    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+
+    IDCT16
+
+    // Stage7
+    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    curr1 = r;
+    input += 128;
+  }
+  for (i = 0; i < 2; i++) {
+    int j;
+    // 1-D idct
+    array_transpose_8x8(l + i * 8, in);
+    array_transpose_8x8(r + i * 8, in + 8);
+
+    IDCT16
+
+    // 2-D
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 2; ++i) {
+    RECON_AND_STORE(dest +  0 * stride, dc_value);
+    RECON_AND_STORE(dest +  1 * stride, dc_value);
+    RECON_AND_STORE(dest +  2 * stride, dc_value);
+    RECON_AND_STORE(dest +  3 * stride, dc_value);
+    RECON_AND_STORE(dest +  4 * stride, dc_value);
+    RECON_AND_STORE(dest +  5 * stride, dc_value);
+    RECON_AND_STORE(dest +  6 * stride, dc_value);
+    RECON_AND_STORE(dest +  7 * stride, dc_value);
+    RECON_AND_STORE(dest +  8 * stride, dc_value);
+    RECON_AND_STORE(dest +  9 * stride, dc_value);
+    RECON_AND_STORE(dest + 10 * stride, dc_value);
+    RECON_AND_STORE(dest + 11 * stride, dc_value);
+    RECON_AND_STORE(dest + 12 * stride, dc_value);
+    RECON_AND_STORE(dest + 13 * stride, dc_value);
+    RECON_AND_STORE(dest + 14 * stride, dc_value);
+    RECON_AND_STORE(dest + 15 * stride, dc_value);
+    dest += 8;
+  }
+}
+
+static void iadst16_8col(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void idct16_8col(__m128i *in) {
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i v[16], u[16], s[16], t[16];
+
+  // stage 1
+  s[0] = in[0];
+  s[1] = in[8];
+  s[2] = in[4];
+  s[3] = in[12];
+  s[4] = in[2];
+  s[5] = in[10];
+  s[6] = in[6];
+  s[7] = in[14];
+  s[8] = in[1];
+  s[9] = in[9];
+  s[10] = in[5];
+  s[11] = in[13];
+  s[12] = in[3];
+  s[13] = in[11];
+  s[14] = in[7];
+  s[15] = in[15];
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[8]  = _mm_packs_epi32(u[0], u[1]);
+  s[15] = _mm_packs_epi32(u[2], u[3]);
+  s[9]  = _mm_packs_epi32(u[4], u[5]);
+  s[14] = _mm_packs_epi32(u[6], u[7]);
+  s[10] = _mm_packs_epi32(u[8], u[9]);
+  s[13] = _mm_packs_epi32(u[10], u[11]);
+  s[11] = _mm_packs_epi32(u[12], u[13]);
+  s[12] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  t[0] = s[0];
+  t[1] = s[1];
+  t[2] = s[2];
+  t[3] = s[3];
+  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[4] = _mm_packs_epi32(u[0], u[1]);
+  t[7] = _mm_packs_epi32(u[2], u[3]);
+  t[5] = _mm_packs_epi32(u[4], u[5]);
+  t[6] = _mm_packs_epi32(u[6], u[7]);
+  t[8] = _mm_add_epi16(s[8], s[9]);
+  t[9] = _mm_sub_epi16(s[8], s[9]);
+  t[10] = _mm_sub_epi16(s[11], s[10]);
+  t[11] = _mm_add_epi16(s[10], s[11]);
+  t[12] = _mm_add_epi16(s[12], s[13]);
+  t[13] = _mm_sub_epi16(s[12], s[13]);
+  t[14] = _mm_sub_epi16(s[15], s[14]);
+  t[15] = _mm_add_epi16(s[14], s[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_add_epi16(t[4], t[5]);
+  s[5] = _mm_sub_epi16(t[4], t[5]);
+  s[6] = _mm_sub_epi16(t[7], t[6]);
+  s[7] = _mm_add_epi16(t[6], t[7]);
+  s[8] = t[8];
+  s[15] = t[15];
+  s[9]  = _mm_packs_epi32(u[8], u[9]);
+  s[14] = _mm_packs_epi32(u[10], u[11]);
+  s[10] = _mm_packs_epi32(u[12], u[13]);
+  s[13] = _mm_packs_epi32(u[14], u[15]);
+  s[11] = t[11];
+  s[12] = t[12];
+
+  // stage 5
+  t[0] = _mm_add_epi16(s[0], s[3]);
+  t[1] = _mm_add_epi16(s[1], s[2]);
+  t[2] = _mm_sub_epi16(s[1], s[2]);
+  t[3] = _mm_sub_epi16(s[0], s[3]);
+  t[4] = s[4];
+  t[7] = s[7];
+
+  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  t[5] = _mm_packs_epi32(u[0], u[1]);
+  t[6] = _mm_packs_epi32(u[2], u[3]);
+
+  t[8] = _mm_add_epi16(s[8], s[11]);
+  t[9] = _mm_add_epi16(s[9], s[10]);
+  t[10] = _mm_sub_epi16(s[9], s[10]);
+  t[11] = _mm_sub_epi16(s[8], s[11]);
+  t[12] = _mm_sub_epi16(s[15], s[12]);
+  t[13] = _mm_sub_epi16(s[14], s[13]);
+  t[14] = _mm_add_epi16(s[13], s[14]);
+  t[15] = _mm_add_epi16(s[12], s[15]);
+
+  // stage 6
+  s[0] = _mm_add_epi16(t[0], t[7]);
+  s[1] = _mm_add_epi16(t[1], t[6]);
+  s[2] = _mm_add_epi16(t[2], t[5]);
+  s[3] = _mm_add_epi16(t[3], t[4]);
+  s[4] = _mm_sub_epi16(t[3], t[4]);
+  s[5] = _mm_sub_epi16(t[2], t[5]);
+  s[6] = _mm_sub_epi16(t[1], t[6]);
+  s[7] = _mm_sub_epi16(t[0], t[7]);
+  s[8] = t[8];
+  s[9] = t[9];
+
+  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  s[10] = _mm_packs_epi32(u[0], u[1]);
+  s[13] = _mm_packs_epi32(u[2], u[3]);
+  s[11] = _mm_packs_epi32(u[4], u[5]);
+  s[12] = _mm_packs_epi32(u[6], u[7]);
+  s[14] = t[14];
+  s[15] = t[15];
+
+  // stage 7
+  in[0] = _mm_add_epi16(s[0], s[15]);
+  in[1] = _mm_add_epi16(s[1], s[14]);
+  in[2] = _mm_add_epi16(s[2], s[13]);
+  in[3] = _mm_add_epi16(s[3], s[12]);
+  in[4] = _mm_add_epi16(s[4], s[11]);
+  in[5] = _mm_add_epi16(s[5], s[10]);
+  in[6] = _mm_add_epi16(s[6], s[9]);
+  in[7] = _mm_add_epi16(s[7], s[8]);
+  in[8] = _mm_sub_epi16(s[7], s[8]);
+  in[9] = _mm_sub_epi16(s[6], s[9]);
+  in[10] = _mm_sub_epi16(s[5], s[10]);
+  in[11] = _mm_sub_epi16(s[4], s[11]);
+  in[12] = _mm_sub_epi16(s[3], s[12]);
+  in[13] = _mm_sub_epi16(s[2], s[13]);
+  in[14] = _mm_sub_epi16(s[1], s[14]);
+  in[15] = _mm_sub_epi16(s[0], s[15]);
+}
+
+static void idct16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  idct16_8col(in0);
+  idct16_8col(in1);
+}
+
+static void iadst16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  iadst16_8col(in0);
+  iadst16_8col(in1);
+}
+
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                               int tx_type) {
+  __m128i in0[16], in1[16];
+
+  load_buffer_8x16(input, in0);
+  input += 8;
+  load_buffer_8x16(input, in1);
+
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      idct16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      break;
+    case 1:  // ADST_DCT
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+    case 2:  // DCT_ADST
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      break;
+    case 3:  // ADST_ADST
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  write_buffer_8x16(dest, in0, stride);
+  dest += 8;
+  write_buffer_8x16(dest, in1, stride);
+}
+
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i in[16], l[16];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+  // First 1-D inverse DCT
+  // Load input data.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+  // Stage2
+  {
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
+    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  {
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+  }
+
+  // Stage5 and Stage6
+  {
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+  }
+
+  // Stage6
+  {
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
+    stp2_10 = _mm_packs_epi32(tmp0, zero);
+    stp2_13 = _mm_packs_epi32(tmp2, zero);
+    stp2_11 = _mm_packs_epi32(tmp4, zero);
+    stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage7. Left 8x16 only.
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+  // Second 1-D inverse transform, performed per 8x16 block
+  for (i = 0; i < 2; i++) {
+    int j;
+    array_transpose_4X8(l + 8 * i, in);
+
+    IDCT16_10
+
+    // Stage7
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+#define LOAD_DQCOEFF(reg, input) \
+  {  \
+    reg = _mm_load_si128((const __m128i *) input); \
+    input += 8; \
+  }  \
+
+#define IDCT32_34 \
+/* Stage1 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+  \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+  \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+                         stg1_1, stp1_16, stp1_31); \
+  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+                         stg1_7, stp1_19, stp1_28); \
+  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+                         stg1_9, stp1_20, stp1_27); \
+  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+                         stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+  \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+                         stg2_1, stp2_8, stp2_15); \
+  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+                         stg2_7, stp2_11, stp2_12); \
+  \
+  stp2_16 = stp1_16; \
+  stp2_19 = stp1_19; \
+  \
+  stp2_20 = stp1_20; \
+  stp2_23 = stp1_23; \
+  \
+  stp2_24 = stp1_24; \
+  stp2_27 = stp1_27; \
+  \
+  stp2_28 = stp1_28; \
+  stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+                         stg3_1, stp1_4, stp1_7); \
+  \
+  stp1_8 = stp2_8; \
+  stp1_11 = stp2_11; \
+  stp1_12 = stp2_12; \
+  stp1_15 = stp2_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+                         stg4_1, stp2_0, stp2_1); \
+  \
+  stp2_4 = stp1_4; \
+  stp2_5 = stp1_4; \
+  stp2_6 = stp1_7; \
+  stp2_7 = stp1_7; \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = stp2_0; \
+  stp1_1 = stp2_1; \
+  stp1_2 = stp2_1; \
+  stp1_3 = stp2_0; \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+
+#define IDCT32 \
+/* Stage1 */ \
+{ \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+  \
+  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+  \
+  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+                         stp1_17, stp1_30) \
+  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+  \
+  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+                         stp2_14) \
+  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+                         stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+  \
+  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+  \
+  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  \
+  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+                         stp1_6) \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  \
+  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+                         stp2_2, stp2_3) \
+  \
+  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[32];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data. Only need to load the top left 8x8 block.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 32));
+  in[2] = _mm_load_si128((const __m128i *)(input + 64));
+  in[3] = _mm_load_si128((const __m128i *)(input + 96));
+  in[4] = _mm_load_si128((const __m128i *)(input + 128));
+  in[5] = _mm_load_si128((const __m128i *)(input + 160));
+  in[6] = _mm_load_si128((const __m128i *)(input + 192));
+  in[7] = _mm_load_si128((const __m128i *)(input + 224));
+
+  for (i = 8; i < 32; ++i) {
+    in[i] = _mm_setzero_si128();
+  }
+
+  array_transpose_8x8(in, in);
+  // TODO(hkuang): Following transposes are unnecessary. But remove them will
+  // lead to performance drop on some devices.
+  array_transpose_8x8(in + 8, in + 8);
+  array_transpose_8x8(in + 16, in + 16);
+  array_transpose_8x8(in + 24, in + 24);
+
+  IDCT32_34
+
+  // 1_D: Store 32 intermediate results for each 8x32 block.
+  col[0] = _mm_add_epi16(stp1_0, stp1_31);
+  col[1] = _mm_add_epi16(stp1_1, stp1_30);
+  col[2] = _mm_add_epi16(stp1_2, stp1_29);
+  col[3] = _mm_add_epi16(stp1_3, stp1_28);
+  col[4] = _mm_add_epi16(stp1_4, stp1_27);
+  col[5] = _mm_add_epi16(stp1_5, stp1_26);
+  col[6] = _mm_add_epi16(stp1_6, stp1_25);
+  col[7] = _mm_add_epi16(stp1_7, stp1_24);
+  col[8] = _mm_add_epi16(stp1_8, stp1_23);
+  col[9] = _mm_add_epi16(stp1_9, stp1_22);
+  col[10] = _mm_add_epi16(stp1_10, stp1_21);
+  col[11] = _mm_add_epi16(stp1_11, stp1_20);
+  col[12] = _mm_add_epi16(stp1_12, stp1_19);
+  col[13] = _mm_add_epi16(stp1_13, stp1_18);
+  col[14] = _mm_add_epi16(stp1_14, stp1_17);
+  col[15] = _mm_add_epi16(stp1_15, stp1_16);
+  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+  for (i = 0; i < 4; i++) {
+    int j;
+    const __m128i zero = _mm_setzero_si128();
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + i * 8, in);
+    IDCT32_34
+
+    // 2_D: Calculate the results and store them to destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+                                 int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[128], zero_idx[16];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j, i32;
+
+  for (i = 0; i < 4; i++) {
+    i32 = (i << 5);
+    // First 1-D idct
+    // Load input data.
+    LOAD_DQCOEFF(in[0], input);
+    LOAD_DQCOEFF(in[8], input);
+    LOAD_DQCOEFF(in[16], input);
+    LOAD_DQCOEFF(in[24], input);
+    LOAD_DQCOEFF(in[1], input);
+    LOAD_DQCOEFF(in[9], input);
+    LOAD_DQCOEFF(in[17], input);
+    LOAD_DQCOEFF(in[25], input);
+    LOAD_DQCOEFF(in[2], input);
+    LOAD_DQCOEFF(in[10], input);
+    LOAD_DQCOEFF(in[18], input);
+    LOAD_DQCOEFF(in[26], input);
+    LOAD_DQCOEFF(in[3], input);
+    LOAD_DQCOEFF(in[11], input);
+    LOAD_DQCOEFF(in[19], input);
+    LOAD_DQCOEFF(in[27], input);
+
+    LOAD_DQCOEFF(in[4], input);
+    LOAD_DQCOEFF(in[12], input);
+    LOAD_DQCOEFF(in[20], input);
+    LOAD_DQCOEFF(in[28], input);
+    LOAD_DQCOEFF(in[5], input);
+    LOAD_DQCOEFF(in[13], input);
+    LOAD_DQCOEFF(in[21], input);
+    LOAD_DQCOEFF(in[29], input);
+    LOAD_DQCOEFF(in[6], input);
+    LOAD_DQCOEFF(in[14], input);
+    LOAD_DQCOEFF(in[22], input);
+    LOAD_DQCOEFF(in[30], input);
+    LOAD_DQCOEFF(in[7], input);
+    LOAD_DQCOEFF(in[15], input);
+    LOAD_DQCOEFF(in[23], input);
+    LOAD_DQCOEFF(in[31], input);
+
+    // checking if all entries are zero
+    zero_idx[0] = _mm_or_si128(in[0], in[1]);
+    zero_idx[1] = _mm_or_si128(in[2], in[3]);
+    zero_idx[2] = _mm_or_si128(in[4], in[5]);
+    zero_idx[3] = _mm_or_si128(in[6], in[7]);
+    zero_idx[4] = _mm_or_si128(in[8], in[9]);
+    zero_idx[5] = _mm_or_si128(in[10], in[11]);
+    zero_idx[6] = _mm_or_si128(in[12], in[13]);
+    zero_idx[7] = _mm_or_si128(in[14], in[15]);
+    zero_idx[8] = _mm_or_si128(in[16], in[17]);
+    zero_idx[9] = _mm_or_si128(in[18], in[19]);
+    zero_idx[10] = _mm_or_si128(in[20], in[21]);
+    zero_idx[11] = _mm_or_si128(in[22], in[23]);
+    zero_idx[12] = _mm_or_si128(in[24], in[25]);
+    zero_idx[13] = _mm_or_si128(in[26], in[27]);
+    zero_idx[14] = _mm_or_si128(in[28], in[29]);
+    zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+      col[i32 + 0] = _mm_setzero_si128();
+      col[i32 + 1] = _mm_setzero_si128();
+      col[i32 + 2] = _mm_setzero_si128();
+      col[i32 + 3] = _mm_setzero_si128();
+      col[i32 + 4] = _mm_setzero_si128();
+      col[i32 + 5] = _mm_setzero_si128();
+      col[i32 + 6] = _mm_setzero_si128();
+      col[i32 + 7] = _mm_setzero_si128();
+      col[i32 + 8] = _mm_setzero_si128();
+      col[i32 + 9] = _mm_setzero_si128();
+      col[i32 + 10] = _mm_setzero_si128();
+      col[i32 + 11] = _mm_setzero_si128();
+      col[i32 + 12] = _mm_setzero_si128();
+      col[i32 + 13] = _mm_setzero_si128();
+      col[i32 + 14] = _mm_setzero_si128();
+      col[i32 + 15] = _mm_setzero_si128();
+      col[i32 + 16] = _mm_setzero_si128();
+      col[i32 + 17] = _mm_setzero_si128();
+      col[i32 + 18] = _mm_setzero_si128();
+      col[i32 + 19] = _mm_setzero_si128();
+      col[i32 + 20] = _mm_setzero_si128();
+      col[i32 + 21] = _mm_setzero_si128();
+      col[i32 + 22] = _mm_setzero_si128();
+      col[i32 + 23] = _mm_setzero_si128();
+      col[i32 + 24] = _mm_setzero_si128();
+      col[i32 + 25] = _mm_setzero_si128();
+      col[i32 + 26] = _mm_setzero_si128();
+      col[i32 + 27] = _mm_setzero_si128();
+      col[i32 + 28] = _mm_setzero_si128();
+      col[i32 + 29] = _mm_setzero_si128();
+      col[i32 + 30] = _mm_setzero_si128();
+      col[i32 + 31] = _mm_setzero_si128();
+      continue;
+    }
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+    array_transpose_8x8(in + 16, in + 16);
+    array_transpose_8x8(in + 24, in + 24);
+
+    IDCT32
+
+    // 1_D: Store 32 intermediate results for each 8x32 block.
+    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+  }
+  for (i = 0; i < 4; i++) {
+    // Second 1-D idct
+    j = i << 3;
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + j, in);
+    array_transpose_8x8(col + j + 32, in + 8);
+    array_transpose_8x8(col + j + 64, in + 16);
+    array_transpose_8x8(col + j + 96, in + 24);
+
+    IDCT32
+
+    // 2_D: Calculate the results and store them to destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 4; ++i) {
+    int j;
+    for (j = 0; j < 32; ++j) {
+      RECON_AND_STORE(dest + j * stride, dc_value);
+    }
+    dest += 8;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+  __m128i ubounded, retval;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+  ubounded = _mm_cmpgt_epi16(value, max);
+  retval = _mm_andnot_si128(ubounded, value);
+  ubounded = _mm_and_si128(ubounded, max);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+  return retval;
+}
+
+void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  __m128i inptr[4];
+  __m128i sign_bits[2];
+  __m128i temp_mm, min_input, max_input;
+  int test;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int optimised_cols = 0;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(12043);
+  const __m128i min = _mm_set1_epi16(-12043);
+  // Load input into __m128i
+  inptr[0] = _mm_loadu_si128((const __m128i *)input);
+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+  // Pack to 16 bits
+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp_mm = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp_mm);
+
+  if (!test) {
+    // Do the row transform
+    idct4_sse2(inptr);
+
+    // Check the min & max values
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp_mm = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp_mm);
+
+    if (test) {
+      transpose_4x4(inptr);
+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vp9_highbd_idct4(input, outptr, bd);
+      input += 4;
+      outptr += 4;
+    }
+  }
+
+  if (optimised_cols) {
+    idct4_sse2(inptr);
+
+    // Final round and shift
+    inptr[0] = _mm_add_epi16(inptr[0], eight);
+    inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+    inptr[0] = _mm_srai_epi16(inptr[0], 4);
+    inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+    // Reconstruction and Store
+    {
+      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+      d0 = _mm_unpacklo_epi64(
+          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+      d2 = _mm_unpacklo_epi64(
+          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+      // store input0
+      _mm_storel_epi64((__m128i *)dest, d0);
+      // store input1
+      d0 = _mm_srli_si128(d0, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride), d0);
+      // store input2
+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+      // store input3
+      d2 = _mm_srli_si128(d2, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[4], temp_out[4];
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j * 4 + i];
+      vp9_highbd_idct4(temp_in, temp_out, bd);
+      for (j = 0; j < 4; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 8; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_8x8(inptr, inptr);
+      for (i = 0; i < 8; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 8; ++i) {
+      vp9_highbd_idct8(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];
+      vp9_highbd_idct8(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // only first 4 row has non-zero coefs
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use fact only first 4 rows contain non-zero coeffs
+      array_transpose_4X8(inptr, inptr);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vp9_highbd_idct8(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];
+      vp9_highbd_idct8(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                       int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 32; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 32; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_16x16(inptr, inptr + 16);
+      for (i = 0; i < 16; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 16; ++i) {
+      vp9_highbd_idct16(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vp9_highbd_idct16(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                      int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Since all non-zero dct coefficients are in upper-left 4x4 area,
+  // we only need to consider first 4 rows here.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform (N.B. This transposes inptr)
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 16; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use fact only first 4 rows contain non-zero coeffs
+      array_transpose_8x8(inptr, inptr);
+      array_transpose_8x8(inptr + 8, inptr + 16);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vp9_highbd_idct16(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vp9_highbd_idct16(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
new file mode 100644
index 000000000..984363d40
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
@@ -0,0 +1,174 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+// perform 8x8 transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+                                                        \
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
+    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
+  }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i tbuf[8];
+  array_transpose_8x8(res0, res0);
+  array_transpose_8x8(res1, tbuf);
+  array_transpose_8x8(res0 + 8, res1);
+  array_transpose_8x8(res1 + 8, res1 + 8);
+
+  res0[8] = tbuf[0];
+  res0[9] = tbuf[1];
+  res0[10] = tbuf[2];
+  res0[11] = tbuf[3];
+  res0[12] = tbuf[4];
+  res0[13] = tbuf[5];
+  res0[14] = tbuf[6];
+  res0[15] = tbuf[7];
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
+
+  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
+  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
+  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
+  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
+  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
+  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
+  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
+  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
+}
+
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      _mm_storel_epi64((__m128i *)(dest), d0); \
+  }
+
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);
+  const __m128i zero = _mm_setzero_si128();
+  // Final rounding and shift
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+  in[8] = _mm_adds_epi16(in[8], final_rounding);
+  in[9] = _mm_adds_epi16(in[9], final_rounding);
+  in[10] = _mm_adds_epi16(in[10], final_rounding);
+  in[11] = _mm_adds_epi16(in[11], final_rounding);
+  in[12] = _mm_adds_epi16(in[12], final_rounding);
+  in[13] = _mm_adds_epi16(in[13], final_rounding);
+  in[14] = _mm_adds_epi16(in[14], final_rounding);
+  in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 6);
+  in[1] = _mm_srai_epi16(in[1], 6);
+  in[2] = _mm_srai_epi16(in[2], 6);
+  in[3] = _mm_srai_epi16(in[3], 6);
+  in[4] = _mm_srai_epi16(in[4], 6);
+  in[5] = _mm_srai_epi16(in[5], 6);
+  in[6] = _mm_srai_epi16(in[6], 6);
+  in[7] = _mm_srai_epi16(in[7], 6);
+  in[8] = _mm_srai_epi16(in[8], 6);
+  in[9] = _mm_srai_epi16(in[9], 6);
+  in[10] = _mm_srai_epi16(in[10], 6);
+  in[11] = _mm_srai_epi16(in[11], 6);
+  in[12] = _mm_srai_epi16(in[12], 6);
+  in[13] = _mm_srai_epi16(in[13], 6);
+  in[14] = _mm_srai_epi16(in[14], 6);
+  in[15] = _mm_srai_epi16(in[15], 6);
+
+  RECON_AND_STORE(dest +  0 * stride, in[0]);
+  RECON_AND_STORE(dest +  1 * stride, in[1]);
+  RECON_AND_STORE(dest +  2 * stride, in[2]);
+  RECON_AND_STORE(dest +  3 * stride, in[3]);
+  RECON_AND_STORE(dest +  4 * stride, in[4]);
+  RECON_AND_STORE(dest +  5 * stride, in[5]);
+  RECON_AND_STORE(dest +  6 * stride, in[6]);
+  RECON_AND_STORE(dest +  7 * stride, in[7]);
+  RECON_AND_STORE(dest +  8 * stride, in[8]);
+  RECON_AND_STORE(dest +  9 * stride, in[9]);
+  RECON_AND_STORE(dest + 10 * stride, in[10]);
+  RECON_AND_STORE(dest + 11 * stride, in[11]);
+  RECON_AND_STORE(dest + 12 * stride, in[12]);
+  RECON_AND_STORE(dest + 13 * stride, in[13]);
+  RECON_AND_STORE(dest + 14 * stride, in[14]);
+  RECON_AND_STORE(dest + 15 * stride, in[15]);
+}
diff --git a/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm b/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm
new file mode 100644
index 000000000..2c1060710
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm
@@ -0,0 +1,300 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides SSSE3 version of the inverse transformation. Part
+; of the functions are originally derived from the ffmpeg project.
+; Note that the current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+pw_16:      times 8 dw 16
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
+%endmacro
+
+TRANSFORM_COEFFS    6270, 15137
+TRANSFORM_COEFFS    3196, 16069
+TRANSFORM_COEFFS   13623,  9102
+
+%macro PAIR_PP_COEFFS 2
+dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MP_COEFFS 2
+dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MM_COEFFS 2
+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
+%endmacro
+
+PAIR_PP_COEFFS     30274, 12540
+PAIR_PP_COEFFS      6392, 32138
+PAIR_MP_COEFFS     18204, 27246
+
+PAIR_PP_COEFFS     12540, 12540
+PAIR_PP_COEFFS     30274, 30274
+PAIR_PP_COEFFS      6392,  6392
+PAIR_PP_COEFFS     32138, 32138
+PAIR_MM_COEFFS     18204, 18204
+PAIR_PP_COEFFS     27246, 27246
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+%macro IDCT8_1D 0
+  SUM_SUB          0,    4,    9
+  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
+  pmulhrsw        m0,  m12
+  pmulhrsw        m4,  m12
+  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
+  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
+
+  SUM_SUB          1,    5,    9
+  SUM_SUB          7,    3,    9
+  SUM_SUB          0,    6,    9
+  SUM_SUB          4,    2,    9
+  SUM_SUB          3,    5,    9
+  pmulhrsw        m3,  m12
+  pmulhrsw        m5,  m12
+
+  SUM_SUB          0,    7,    9
+  SUM_SUB          4,    3,    9
+  SUM_SUB          2,    5,    9
+  SUM_SUB          6,    1,    9
+
+  SWAP             3,    6
+  SWAP             1,    4
+%endmacro
+
+; This macro handles 8 pixels per line
+%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
+  paddw           m%1, m11
+  paddw           m%2, m11
+  psraw           m%1, 5
+  psraw           m%2, 5
+
+  movh            m%3, [outputq]
+  movh            m%4, [outputq + strideq]
+  punpcklbw       m%3, m%5
+  punpcklbw       m%4, m%5
+  paddw           m%3, m%1
+  paddw           m%4, m%2
+  packuswb        m%3, m%5
+  packuswb        m%4, m%5
+  movh               [outputq], m%3
+  movh     [outputq + strideq], m%4
+%endmacro
+
+INIT_XMM ssse3
+; full inverse 8x8 2D-DCT transform
+cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
+  mova     m8, [pd_8192]
+  mova    m11, [pw_16]
+  mova    m12, [pw_11585x2]
+
+  lea      r3, [2 * strideq]
+
+  mova     m0, [inputq +   0]
+  mova     m1, [inputq +  16]
+  mova     m2, [inputq +  32]
+  mova     m3, [inputq +  48]
+  mova     m4, [inputq +  64]
+  mova     m5, [inputq +  80]
+  mova     m6, [inputq +  96]
+  mova     m7, [inputq + 112]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
+  mova       m8, [pd_8192]
+  mova      m11, [pw_16]
+  mova      m12, [pw_11585x2]
+
+  lea        r3, [2 * strideq]
+
+  mova       m0, [inputq +  0]
+  mova       m1, [inputq + 16]
+  mova       m2, [inputq + 32]
+  mova       m3, [inputq + 48]
+
+  punpcklwd  m0, m1
+  punpcklwd  m2, m3
+  punpckhdq  m9, m0, m2
+  punpckldq  m0, m2
+  SWAP       2, 9
+
+  ; m0 -> [0], [0]
+  ; m1 -> [1], [1]
+  ; m2 -> [2], [2]
+  ; m3 -> [3], [3]
+  punpckhqdq m10, m0, m0
+  punpcklqdq m0,  m0
+  punpckhqdq m9,  m2, m2
+  punpcklqdq m2,  m2
+  SWAP       1, 10
+  SWAP       3,  9
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m2, [dpw_30274_12540]
+  pmulhrsw   m1, [dpw_6392_32138]
+  pmulhrsw   m3, [dpw_m18204_27246]
+
+  SUM_SUB    0, 2, 9
+  SUM_SUB    1, 3, 9
+
+  punpcklqdq m9, m3, m3
+  punpckhqdq m5, m3, m9
+
+  SUM_SUB    3, 5, 9
+  punpckhqdq m5, m3
+  pmulhrsw   m5, m12
+
+  punpckhqdq m9, m1, m5
+  punpcklqdq m1, m5
+  SWAP       5, 9
+
+  SUM_SUB    0, 5, 9
+  SUM_SUB    2, 1, 9
+
+  punpckhqdq m3, m0, m0
+  punpckhqdq m4, m1, m1
+  punpckhqdq m6, m5, m5
+  punpckhqdq m7, m2, m2
+
+  punpcklwd  m0, m3
+  punpcklwd  m7, m2
+  punpcklwd  m1, m4
+  punpcklwd  m6, m5
+
+  punpckhdq  m4, m0, m7
+  punpckldq  m0, m7
+  punpckhdq  m10, m1, m6
+  punpckldq  m5, m1, m6
+
+  punpckhqdq m1, m0, m5
+  punpcklqdq m0, m5
+  punpckhqdq m3, m4, m10
+  punpcklqdq m2, m4, m10
+
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m6, m2, [dpw_30274_30274]
+  pmulhrsw   m4, m2, [dpw_12540_12540]
+
+  pmulhrsw   m7, m1, [dpw_32138_32138]
+  pmulhrsw   m1, [dpw_6392_6392]
+  pmulhrsw   m5, m3, [dpw_m18204_m18204]
+  pmulhrsw   m3, [dpw_27246_27246]
+
+  mova       m2, m0
+  SUM_SUB    0, 6, 9
+  SUM_SUB    2, 4, 9
+  SUM_SUB    1, 5, 9
+  SUM_SUB    7, 3, 9
+
+  SUM_SUB    3, 5, 9
+  pmulhrsw   m3, m12
+  pmulhrsw   m5, m12
+
+  SUM_SUB    0, 7, 9
+  SUM_SUB    2, 3, 9
+  SUM_SUB    4, 5, 9
+  SUM_SUB    6, 1, 9
+
+  SWAP       3, 6
+  SWAP       1, 2
+  SWAP       2, 4
+
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+%endif
diff --git a/media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
new file mode 100644
index 000000000..22b573188
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
@@ -0,0 +1,667 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4:  times 8 dw 4
+pw_8:  times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4:  times 8 dw 2
+pw2_8:  times 8 dw 4
+pw2_16:  times 8 dw 8
+pw2_32:  times 8 dw 16
+
+SECTION .text
+
+INIT_MMX sse
+cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movd                  m0, [aboveq]
+  punpckldq             m0, [leftq]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw_4)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movd                  m0, [leftq]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_4)]
+  psraw                 m0, 2
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movd                  m0, [aboveq]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_4)]
+  psraw                 m0, 2
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movq                  m0, [aboveq]
+  movq                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw_8)]
+  psraw                 m0, 4
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movq                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_8)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movq                  m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_8)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movd     m0,        [GLOBAL(dc_128)]
+  movd    [dstq          ], m0
+  movd    [dstq+strideq  ], m0
+  movd    [dstq+strideq*2], m0
+  movd    [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movq    m0,        [GLOBAL(dc_128)]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw_16)]
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  pxor                  m2, m2
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_16)]
+  psraw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  pxor                  m2, m2
+  mova                  m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_16)]
+  psraw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  mova    m0,        [GLOBAL(dc_128)]
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+  RESTORE_GOT
+  RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [leftq]
+  mova                  m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  psadbw                m0, m1
+  psadbw                m2, m1
+  psadbw                m3, m1
+  psadbw                m4, m1
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw_32)]
+  psraw                 m0, 6
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_32)]
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [leftq]
+  mova                  m2, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_32)]
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  mova    m0,        [GLOBAL(dc_128)]
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movd                  m0, [aboveq]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  RET
+
+INIT_MMX sse
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+  movq                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 4
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 8
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m1
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m1
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m1
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m1
+  lea                 dstq, [dstq+strideq*4]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_MMX sse
+cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  movd                  m0, [aboveq]
+  punpcklbw             m2, m1
+  punpcklbw             m0, m1
+  pshufw                m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -2
+  add                leftq, 4
+  psubw                 m0, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m3, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m3, m1
+  pshufw                m2, m2, 0x0
+  pshufw                m3, m3, 0x0
+  paddw                 m2, m0
+  paddw                 m3, m0
+  packuswb              m2, m2
+  packuswb              m3, m3
+  movd      [dstq        ], m2
+  movd      [dstq+strideq], m3
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  movq                  m0, [aboveq]
+  punpcklbw             m2, m1
+  punpcklbw             m0, m1
+  pshuflw               m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -4
+  punpcklqdq            m2, m2
+  add                leftq, 8
+  psubw                 m0, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m3, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m3, m1
+  pshuflw               m2, m2, 0x0
+  pshuflw               m3, m3, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m3, m3
+  paddw                 m2, m0
+  paddw                 m3, m0
+  packuswb              m2, m3
+  movq      [dstq        ], m2
+  movhps    [dstq+strideq], m2
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  mova                  m0, [aboveq]
+  punpcklbw             m2, m1
+  punpckhbw             m4, m0, m1
+  punpcklbw             m0, m1
+  pshuflw               m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -8
+  punpcklqdq            m2, m2
+  add                leftq, 16
+  psubw                 m0, m2
+  psubw                 m4, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m3, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m3, m1
+  pshuflw               m2, m2, 0x0
+  pshuflw               m3, m3, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m3, m3
+  paddw                 m5, m2, m0
+  paddw                 m6, m3, m0
+  paddw                 m2, m4
+  paddw                 m3, m4
+  packuswb              m5, m2
+  packuswb              m6, m3
+  mova      [dstq        ], m5
+  mova      [dstq+strideq], m6
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  mova                  m0, [aboveq]
+  mova                  m4, [aboveq+16]
+  punpcklbw             m2, m1
+  punpckhbw             m3, m0, m1
+  punpckhbw             m5, m4, m1
+  punpcklbw             m0, m1
+  punpcklbw             m4, m1
+  pshuflw               m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -16
+  punpcklqdq            m2, m2
+  add                leftq, 32
+  psubw                 m0, m2
+  psubw                 m3, m2
+  psubw                 m4, m2
+  psubw                 m5, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m6, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m6, m1
+  pshuflw               m2, m2, 0x0
+  pshuflw               m6, m6, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m6, m6
+  paddw                 m7, m2, m0
+  paddw                 m8, m2, m3
+  paddw                 m9, m2, m4
+  paddw                 m2, m5
+  packuswb              m7, m8
+  packuswb              m9, m2
+  paddw                 m2, m6, m0
+  paddw                 m8, m6, m3
+  mova   [dstq           ], m7
+  paddw                 m7, m6, m4
+  paddw                 m6, m5
+  mova   [dstq        +16], m9
+  packuswb              m2, m8
+  packuswb              m7, m6
+  mova   [dstq+strideq   ], m2
+  mova   [dstq+strideq+16], m7
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+%endif
diff --git a/media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
new file mode 100644
index 000000000..88df9b2d1
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -0,0 +1,1036 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+SECTION .text
+
+INIT_MMX ssse3
+cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  add                leftq, 4
+  mov                lineq, -2
+  pxor                  m0, m0
+.loop:
+  movd                  m1, [leftq+lineq*2  ]
+  movd                  m2, [leftq+lineq*2+1]
+  pshufb                m1, m0
+  pshufb                m2, m0
+  movd      [dstq        ], m1
+  movd      [dstq+strideq], m2
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_MMX ssse3
+cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  add                leftq, 8
+  mov                lineq, -4
+  pxor                  m0, m0
+.loop:
+  movd                  m1, [leftq+lineq*2  ]
+  movd                  m2, [leftq+lineq*2+1]
+  pshufb                m1, m0
+  pshufb                m2, m0
+  movq      [dstq        ], m1
+  movq      [dstq+strideq], m2
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM ssse3
+cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  add                leftq, 16
+  mov                lineq, -8
+  pxor                  m0, m0
+.loop:
+  movd                  m1, [leftq+lineq*2  ]
+  movd                  m2, [leftq+lineq*2+1]
+  pshufb                m1, m0
+  pshufb                m2, m0
+  mova      [dstq        ], m1
+  mova      [dstq+strideq], m2
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM ssse3
+cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  add                leftq, 32
+  mov                lineq, -16
+  pxor                  m0, m0
+.loop:
+  movd                  m1, [leftq+lineq*2  ]
+  movd                  m2, [leftq+lineq*2+1]
+  pshufb                m1, m0
+  pshufb                m2, m0
+  mova   [dstq           ], m1
+  mova   [dstq        +16], m1
+  mova   [dstq+strideq   ], m2
+  mova   [dstq+strideq+16], m2
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_MMX ssse3
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movq                m0, [aboveq]
+  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
+  pshufb              m1, m0, [GLOBAL(sh_b01234577)]
+  pshufb              m0, [GLOBAL(sh_b12345677)]
+  pavgb               m3, m2, m1
+  pxor                m2, m1
+  pand                m2, [GLOBAL(pb_1)]
+  psubb               m3, m2
+  pavgb               m0, m3
+
+  ; store 4 lines
+  movd    [dstq        ], m0
+  psrlq               m0, 8
+  movd    [dstq+strideq], m0
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m0, 8
+  movd    [dstq        ], m0
+  psrlq               m0, 8
+  movd    [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX ssse3
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movq                m0, [aboveq]
+  mova                m1, [GLOBAL(sh_b12345677)]
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
+  pavgb               m3, m2, m0
+  pxor                m2, m0
+  pshufb              m0, m1
+  pand                m2, [GLOBAL(pb_1)]
+  psubb               m3, m2
+  pavgb               m0, m3
+
+  ; store 4 lines
+  movq  [dstq          ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq  ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq*2], m0
+  pshufb              m0, m1
+  movq  [dstq+stride3q ], m0
+  pshufb              m0, m1
+  lea               dstq, [dstq+strideq*4]
+
+  ; store next 4 lines
+  movq  [dstq          ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq  ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq*2], m0
+  pshufb              m0, m1
+  movq  [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
+  GET_GOT     goffsetq
+
+  mova                   m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, dst8, line
+  lea              stride3q, [strideq*3]
+  lea                 dst8q, [dstq+strideq*8]
+  mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb                 m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+  pavgb                  m3, m2, m0
+  pxor                   m2, m0
+  pshufb                 m0, m1
+  pand                   m2, [GLOBAL(pb_1)]
+  psubb                  m3, m2
+  pavgb                  m0, m3
+
+  ; first 4 lines and first half of 3rd 4 lines
+  mov                 lined, 2
+.loop:
+  mova   [dstq            ], m0
+  movhps [dst8q           ], m0
+  pshufb                 m0, m1
+  mova   [dstq +strideq   ], m0
+  movhps [dst8q+strideq   ], m0
+  pshufb                 m0, m1
+  mova   [dstq +strideq*2 ], m0
+  movhps [dst8q+strideq*2 ], m0
+  pshufb                 m0, m1
+  mova   [dstq +stride3q  ], m0
+  movhps [dst8q+stride3q  ], m0
+  pshufb                 m0, m1
+  lea                  dstq, [dstq +strideq*4]
+  lea                 dst8q, [dst8q+strideq*4]
+  dec                 lined
+  jnz .loop
+
+  ; bottom-right 8x8 block
+  movhps [dstq          +8], m0
+  movhps [dstq+strideq  +8], m0
+  movhps [dstq+strideq*2+8], m0
+  movhps [dstq+stride3q +8], m0
+  lea                  dstq, [dstq+strideq*4]
+  movhps [dstq          +8], m0
+  movhps [dstq+strideq  +8], m0
+  movhps [dstq+strideq*2+8], m0
+  movhps [dstq+stride3q +8], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
+  GET_GOT     goffsetq
+
+  mova                   m0, [aboveq]
+  mova                   m4, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, dst16, line
+  lea              stride3q, [strideq*3]
+  lea                dst16q, [dstq  +strideq*8]
+  lea                dst16q, [dst16q+strideq*8]
+  mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb                 m2, m4, [GLOBAL(sh_b23456789abcdefff)]
+  pavgb                  m3, m2, m4
+  pxor                   m2, m4
+  palignr                m5, m4, m0, 1
+  palignr                m6, m4, m0, 2
+  pshufb                 m4, m1
+  pand                   m2, [GLOBAL(pb_1)]
+  psubb                  m3, m2
+  pavgb                  m4, m3
+  pavgb                  m3, m0, m6
+  pxor                   m0, m6
+  pand                   m0, [GLOBAL(pb_1)]
+  psubb                  m3, m0
+  pavgb                  m5, m3
+
+  ; write 4x4 lines (and the first half of the second 4x4 lines)
+  mov                  lined, 4
+.loop:
+  mova [dstq               ], m5
+  mova [dstq            +16], m4
+  mova [dst16q             ], m4
+  palignr                 m3, m4, m5, 1
+  pshufb                  m4, m1
+  mova [dstq  +strideq     ], m3
+  mova [dstq  +strideq  +16], m4
+  mova [dst16q+strideq     ], m4
+  palignr                 m5, m4, m3, 1
+  pshufb                  m4, m1
+  mova [dstq  +strideq*2   ], m5
+  mova [dstq  +strideq*2+16], m4
+  mova [dst16q+strideq*2   ], m4
+  palignr                 m3, m4, m5, 1
+  pshufb                  m4, m1
+  mova [dstq  +stride3q    ], m3
+  mova [dstq  +stride3q +16], m4
+  mova [dst16q+stride3q    ], m4
+  palignr                 m5, m4, m3, 1
+  pshufb                  m4, m1
+  lea                  dstq, [dstq  +strideq*4]
+  lea                dst16q, [dst16q+strideq*4]
+  dec                 lined
+  jnz .loop
+
+  ; write second half of second 4x4 lines
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4]
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4]
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4]
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+
+  RESTORE_GOT
+  RET
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+  pavgb               %4, %1, %3
+  pxor                %3, %1
+  pand                %3, [GLOBAL(pb_1)]
+  psubb               %4, %3
+  pavgb               %4, %2
+%endmacro
+
+INIT_XMM ssse3
+cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movq                m3, [aboveq]
+  pshufb              m1, m3, [GLOBAL(sh_b23456777)]
+  pshufb              m2, m3, [GLOBAL(sh_b12345677)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
+  pavgb               m3, m2
+
+  ; store 4 lines
+  movd    [dstq        ], m3
+  movd    [dstq+strideq], m4
+  lea               dstq, [dstq+strideq*2]
+  psrldq              m3, 1
+  psrldq              m4, 1
+  movd    [dstq        ], m3
+  movd    [dstq+strideq], m4
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movq                m3, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]
+  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]
+  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]
+  pshufb              m3, [GLOBAL(sh_b0123456777777777)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
+  pavgb               m3, m2
+
+  ; store 4 lines
+  movq    [dstq        ], m3
+  movq    [dstq+strideq], m4
+  psrldq              m3, 1
+  psrldq              m4, 1
+  movq  [dstq+strideq*2], m3
+  movq  [dstq+stride3q ], m4
+  lea               dstq, [dstq+strideq*4]
+  psrldq              m3, 1
+  psrldq              m4, 1
+
+  ; store 4 lines
+  movq    [dstq        ], m3
+  movq    [dstq+strideq], m4
+  psrldq              m3, 1
+  psrldq              m4, 1
+  movq  [dstq+strideq*2], m3
+  movq  [dstq+stride3q ], m4
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
+  GET_GOT     goffsetq
+
+  mova                m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, line
+  lea           stride3q, [strideq*3]
+  mova                m1, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+  pshufb              m3, m0, m1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
+  pavgb               m0, m3
+
+  mov              lined, 4
+.loop:
+  mova  [dstq          ], m0
+  mova  [dstq+strideq  ], m4
+  pshufb              m0, m1
+  pshufb              m4, m1
+  mova  [dstq+strideq*2], m0
+  mova  [dstq+stride3q ], m4
+  pshufb              m0, m1
+  pshufb              m4, m1
+  lea               dstq, [dstq+strideq*4]
+  dec              lined
+  jnz .loop
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
+  GET_GOT     goffsetq
+
+  mova                   m0, [aboveq]
+  mova                   m7, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, line
+  mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
+  lea              stride3q, [strideq*3]
+  pshufb                 m2, m7, [GLOBAL(sh_b23456789abcdefff)]
+  pshufb                 m3, m7, m1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
+  palignr                m6, m7, m0, 1
+  palignr                m5, m7, m0, 2
+  pavgb                  m7, m3
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
+  pavgb                  m0, m6
+
+  mov                 lined, 8
+.loop:
+  mova  [dstq             ], m0
+  mova  [dstq          +16], m7
+  mova  [dstq+strideq     ], m2
+  mova  [dstq+strideq  +16], m4
+  palignr                m3, m7, m0, 1
+  palignr                m5, m4, m2, 1
+  pshufb                 m7, m1
+  pshufb                 m4, m1
+
+  mova  [dstq+strideq*2   ], m3
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m5
+  mova  [dstq+stride3q +16], m4
+  palignr                m0, m7, m3, 1
+  palignr                m2, m4, m5, 1
+  pshufb                 m7, m1
+  pshufb                 m4, m1
+  lea                  dstq, [dstq+strideq*4]
+  dec                 lined
+  jnz .loop
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  movd                m0, [leftq]               ; l1, l2, l3, l4
+  movd                m1, [aboveq-1]            ; tl, t1, t2, t3
+  punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
+  pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+  psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
+  psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1
+  ; A2 B2 A1 B1
+  ; A3 B3 A2 B2
+  ; A4 B4 A3 B3
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
+  pavgb               m1, m0                    ; 2-tap avg A4 A3 A2 A1
+
+  punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+stride3q ], m3
+  psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+strideq*2], m3
+  psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+strideq  ], m3
+  psrldq              m3, 2                     ; A1 B1 C1 D1 ..
+  movd  [dstq          ], m3
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  movq                m0, [leftq]                     ; [0- 7] l1-8 [byte]
+  movhps              m0, [aboveq-1]                  ; [8-15] tl, t1-7 [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1 [word]
+  pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1,tl [word]
+  pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1,tl,t1 [word]
+  pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl,t1-7 [word]
+  psrldq              m4, m0, 1                       ; t1-7 [word]
+  psrldq              m5, m0, 2                       ; t2-7 [word]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1
+  ; A2 B2 A1 B1 C1 D1 E1 F1
+  ; A3 B3 A2 B2 A1 B1 C1 D1
+  ; A4 B4 A3 B3 A2 B2 A1 B1
+  ; A5 B5 A4 B4 A3 B3 A2 B2
+  ; A6 B6 A5 B5 A4 B4 A3 B3
+  ; A7 B7 A6 B6 A5 B5 A4 B4
+  ; A8 B8 A7 B7 A6 B6 A5 B5
+  pavgb               m6, m1, m2                ; 2-tap avg A8-A1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1
+
+  punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1
+
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+
+  movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
+  palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
+  movq  [dstq+strideq*2], m0
+  psrldq              m0, 2                     ; A-B2, A-B1, C-H1
+  movq  [dstq+strideq  ], m0
+  psrldq              m0, 2                     ; A-H1
+  movq  [dstq          ], m0
+  lea               dstq, [dstq+strideq*4]
+  movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
+  psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
+  movq  [dstq+strideq*2], m6
+  psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
+  movq  [dstq+strideq  ], m6
+  psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
+  movq  [dstq          ], m6
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  mova                m0, [leftq]
+  movu                m7, [aboveq-1]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+  palignr             m5, m0, m6, 15
+  palignr             m3, m0, m6, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
+  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)]
+  pavgb               m5, m0                            ; A1 - Ag
+
+  punpcklbw           m0, m4, m5                        ; A-B8 ... A-B1
+  punpckhbw           m4, m5                            ; A-B9 ... A-Bg
+
+  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1
+
+  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  palignr             m2, m1, m6, 14
+  mova  [dstq          ], m2
+  palignr             m2, m1, m6, 12
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m1, m6, 10
+  mova  [dstq+strideq*2], m2
+  palignr             m2, m1, m6, 8
+  mova  [dstq+stride3q ], m2
+  lea               dstq, [dstq+strideq*4]
+  palignr             m2, m1, m6, 6
+  mova  [dstq          ], m2
+  palignr             m2, m1, m6, 4
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m1, m6, 2
+  mova  [dstq+strideq*2], m2
+  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)]
+  mova  [dstq+stride3q ], m6
+  lea               dstq, [dstq+strideq*4]
+
+  palignr             m2, m6, m4, 14
+  mova  [dstq          ], m2
+  palignr             m2, m6, m4, 12
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m6, m4, 10
+  mova  [dstq+strideq*2], m2
+  palignr             m2, m6, m4, 8
+  mova  [dstq+stride3q ], m2
+  lea               dstq, [dstq+strideq*4]
+  palignr             m2, m6, m4, 6
+  mova  [dstq          ], m2
+  palignr             m2, m6, m4, 4
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m6, m4, 2
+  mova  [dstq+strideq*2], m2
+  mova  [dstq+stride3q ], m4
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  mova                  m0, [leftq]
+  movu                  m7, [aboveq-1]
+  movu                  m1, [aboveq+15]
+
+  pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2          ; 3-tap avg above [high]
+
+  palignr               m3, m1, m7, 1
+  palignr               m5, m1, m7, 2
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg above [low]
+
+  pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
+  palignr               m5, m0, m7, 15
+  palignr               m3, m0, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
+  pavgb                 m5, m0                            ; A1 - Ag
+  punpcklbw             m6, m4, m5                        ; A-B8 ... A-B1
+  punpckhbw             m4, m5                            ; A-B9 ... A-Bg
+  pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]
+
+  DEFINE_ARGS dst, stride, stride3, left, line
+  lea             stride3q, [strideq*3]
+
+  palignr               m5, m2, m1, 14
+  palignr               m7, m1, m6, 14
+  mova  [dstq            ], m7
+  mova  [dstq+16         ], m5
+  palignr               m5, m2, m1, 12
+  palignr               m7, m1, m6, 12
+  mova  [dstq+strideq    ], m7
+  mova  [dstq+strideq+16 ], m5
+  palignr                m5, m2, m1, 10
+  palignr                m7, m1, m6, 10
+  mova  [dstq+strideq*2   ], m7
+  mova  [dstq+strideq*2+16], m5
+  palignr                m5, m2, m1, 8
+  palignr                m7, m1, m6, 8
+  mova  [dstq+stride3q    ], m7
+  mova  [dstq+stride3q+16 ], m5
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m5, m2, m1, 6
+  palignr                m7, m1, m6, 6
+  mova  [dstq             ], m7
+  mova  [dstq+16          ], m5
+  palignr                m5, m2, m1, 4
+  palignr                m7, m1, m6, 4
+  mova  [dstq+strideq     ], m7
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m2, m1, 2
+  palignr                m7, m1, m6, 2
+  mova  [dstq+strideq*2   ], m7
+  mova  [dstq+strideq*2+16], m5
+  mova  [dstq+stride3q    ], m6
+  mova  [dstq+stride3q+16 ], m1
+  lea                  dstq, [dstq+strideq*4]
+
+  palignr                m5, m1, m6, 14
+  palignr                m3, m6, m4, 14
+  mova  [dstq             ], m3
+  mova  [dstq+16          ], m5
+  palignr                m5, m1, m6, 12
+  palignr                m3, m6, m4, 12
+  mova  [dstq+strideq     ], m3
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m1, m6, 10
+  palignr                m3, m6, m4, 10
+  mova  [dstq+strideq*2   ], m3
+  mova  [dstq+strideq*2+16], m5
+  palignr                m5, m1, m6, 8
+  palignr                m3, m6, m4, 8
+  mova  [dstq+stride3q    ], m3
+  mova  [dstq+stride3q+16 ], m5
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m5, m1, m6, 6
+  palignr                m3, m6, m4, 6
+  mova  [dstq             ], m3
+  mova  [dstq+16          ], m5
+  palignr                m5, m1, m6, 4
+  palignr                m3, m6, m4, 4
+  mova  [dstq+strideq     ], m3
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m1, m6, 2
+  palignr                m3, m6, m4, 2
+  mova  [dstq+strideq*2   ], m3
+  mova  [dstq+strideq*2+16], m5
+  mova  [dstq+stride3q    ], m4
+  mova  [dstq+stride3q+16 ], m6
+  lea               dstq, [dstq+strideq*4]
+
+  mova                   m7, [leftq]
+  mova                   m3, [leftq+16]
+  palignr                m5, m3, m7, 15
+  palignr                m0, m3, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2          ; 3-tap avg Bh -
+  pavgb                  m5, m3                            ; Ah -
+  punpcklbw              m3, m2, m5                        ; A-B8 ... A-B1
+  punpckhbw              m2, m5                            ; A-B9 ... A-Bg
+  pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]
+
+  palignr                m7, m6, m4, 14
+  palignr                m0, m4, m3, 14
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m6, m4, 12
+  palignr                m0, m4, m3, 12
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m6, m4, 10
+  palignr                m0, m4, m3, 10
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  palignr                m7, m6, m4, 8
+  palignr                m0, m4, m3, 8
+  mova  [dstq+stride3q    ], m0
+  mova  [dstq+stride3q+16 ], m7
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m7, m6, m4, 6
+  palignr                m0, m4, m3, 6
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m6, m4, 4
+  palignr                m0, m4, m3, 4
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m6, m4, 2
+  palignr                m0, m4, m3, 2
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m3
+  mova  [dstq+stride3q+16 ], m4
+  lea                  dstq, [dstq+strideq*4]
+
+  palignr                m7, m4, m3, 14
+  palignr                m0, m3, m2, 14
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m4, m3, 12
+  palignr                m0, m3, m2, 12
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m4, m3, 10
+  palignr                m0, m3, m2, 10
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  palignr                m7, m4, m3, 8
+  palignr                m0, m3, m2, 8
+  mova  [dstq+stride3q    ], m0
+  mova  [dstq+stride3q+16 ], m7
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m7, m4, m3, 6
+  palignr                m0, m3, m2, 6
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m4, m3, 4
+  palignr                m0, m3, m2, 4
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m4, m3, 2
+  palignr                m0, m3, m2, 2
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m2
+  mova  [dstq+stride3q+16 ], m3
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX ssse3
+cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
+  GET_GOT     goffsetq
+  movd                m0, [leftq]                ; abcd [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
+  pshufb              m3, m0, [GLOBAL(sh_b2333)] ; cddd
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
+  pavgb               m1, m0             ; ab, bc, cd, d [byte]
+
+  punpcklbw           m1, m2             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+  movd    [dstq        ], m1
+  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
+  movd    [dstq+strideq], m1
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m1, 16             ; cd, c3d, d, d
+  movd    [dstq        ], m1
+  pshufw              m1, m1, q1111      ; d, d, d, d
+  movd    [dstq+strideq], m1
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  movq                m3, [leftq]            ; abcdefgh [byte]
+  lea           stride3q, [strideq*3]
+
+  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]
+  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]
+  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
+  pavgb               m0, m2
+  punpcklbw           m0, m3        ; interleaved output
+
+  movq  [dstq          ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq  ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq*2], m0
+  psrldq              m0, 2
+  movq  [dstq+stride3q ], m0
+  lea               dstq, [dstq+strideq*4]
+  pshufhw             m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+  psrldq              m0, 2
+  movq  [dstq          ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq  ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq*2], m0
+  psrldq              m0, 2
+  movq  [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  lea           stride3q, [strideq*3]
+  mova                m0, [leftq]            ; abcdefghijklmnop [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+  pavgb               m1, m0                 ; ab, bc, cd .. no, op, pp [byte]
+
+  punpckhbw           m4, m1, m3    ; interleaved input
+  punpcklbw           m1, m3        ; interleaved output
+  mova  [dstq          ], m1
+  palignr             m3, m4, m1, 2
+  mova  [dstq+strideq  ], m3
+  palignr             m3, m4, m1, 4
+  mova  [dstq+strideq*2], m3
+  palignr             m3, m4, m1, 6
+  mova  [dstq+stride3q ], m3
+  lea               dstq, [dstq+strideq*4]
+  palignr             m3, m4, m1, 8
+  mova  [dstq          ], m3
+  palignr             m3, m4, m1, 10
+  mova  [dstq+strideq  ], m3
+  palignr             m3, m4, m1, 12
+  mova  [dstq+strideq*2], m3
+  palignr             m3, m4, m1, 14
+  mova  [dstq+stride3q ], m3
+  DEFINE_ARGS dst, stride, stride3, line
+  mov              lined, 2
+  mova                m0, [GLOBAL(sh_b23456789abcdefff)]
+.loop:
+  lea               dstq, [dstq+strideq*4]
+  mova  [dstq          ], m4
+  pshufb              m4, m0
+  mova  [dstq+strideq  ], m4
+  pshufb              m4, m0
+  mova  [dstq+strideq*2], m4
+  pshufb              m4, m0
+  mova  [dstq+stride3q ], m4
+  pshufb              m4, m0
+  dec              lined
+  jnz .loop
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  lea           stride3q, [strideq*3]
+  mova                m1, [leftq]              ;  0-15 [byte]
+  mova                m2, [leftq+16]           ; 16-31 [byte]
+  pshufb              m0, m2, [GLOBAL(sh_b23456789abcdefff)]
+  pshufb              m4, m2, [GLOBAL(sh_b123456789abcdeff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
+  palignr             m6, m2, m1, 1
+  palignr             m5, m2, m1, 2
+  pavgb               m2, m4         ; high 16px even lines
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
+  pavgb                   m1, m6         ; low 16px even lines
+
+  punpckhbw               m6, m1, m0               ; interleaved output 2
+  punpcklbw               m1, m0                   ; interleaved output 1
+
+  punpckhbw               m7, m2, m3               ; interleaved output 4
+  punpcklbw               m2, m3                   ; interleaved output 3
+
+  ; output 1st 8 lines (and half of 2nd 8 lines)
+  DEFINE_ARGS dst, stride, stride3, dst8
+  lea                  dst8q, [dstq+strideq*8]
+  mova  [dstq              ], m1
+  mova  [dstq           +16], m6
+  mova  [dst8q             ], m6
+  palignr             m0, m6, m1, 2
+  palignr             m4, m2, m6, 2
+  mova  [dstq +strideq     ], m0
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m0, m6, m1, 4
+  palignr             m4, m2, m6, 4
+  mova  [dstq +strideq*2   ], m0
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m0, m6, m1, 6
+  palignr             m4, m2, m6, 6
+  mova  [dstq +stride3q    ], m0
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq +strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  palignr             m0, m6, m1, 8
+  palignr             m4, m2, m6, 8
+  mova  [dstq              ], m0
+  mova  [dstq           +16], m4
+  mova  [dst8q             ], m4
+  palignr             m0, m6, m1, 10
+  palignr             m4, m2, m6, 10
+  mova  [dstq +strideq     ], m0
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m0, m6, m1, 12
+  palignr             m4, m2, m6, 12
+  mova  [dstq +strideq*2   ], m0
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m0, m6, m1, 14
+  palignr             m4, m2, m6, 14
+  mova  [dstq +stride3q    ], m0
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+
+  ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+  mova  [dstq           +16], m2
+  mova  [dst8q             ], m2
+  palignr             m4, m7, m2, 2
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m4, m7, m2, 4
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m4, m7, m2, 6
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  palignr             m4, m7, m2, 8
+  mova  [dstq           +16], m4
+  mova  [dst8q             ], m4
+  palignr             m4, m7, m2, 10
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m4, m7, m2, 12
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m4, m7, m2, 14
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+
+  ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+  mova                m0, [GLOBAL(sh_b23456789abcdefff)]
+  mova  [dstq           +16], m7
+  mova  [dst8q             ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq  +16], m7
+  mova  [dst8q+strideq     ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq*2+16], m7
+  mova  [dst8q+strideq*2   ], m7
+  pshufb              m7, m0
+  mova  [dstq +stride3q +16], m7
+  mova  [dst8q+stride3q    ], m7
+  pshufb              m7, m0
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  mova  [dstq           +16], m7
+  mova  [dst8q             ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq  +16], m7
+  mova  [dst8q+strideq     ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq*2+16], m7
+  mova  [dst8q+strideq*2   ], m7
+  pshufb              m7, m0
+  mova  [dstq +stride3q +16], m7
+  mova  [dst8q+stride3q    ], m7
+  pshufb              m7, m0
+  lea               dstq, [dstq+strideq*4]
+
+  ; output last half of 4th 8 lines
+  mova  [dstq           +16], m7
+  mova  [dstq +strideq  +16], m7
+  mova  [dstq +strideq*2+16], m7
+  mova  [dstq +stride3q +16], m7
+  lea               dstq, [dstq+strideq*4]
+  mova  [dstq           +16], m7
+  mova  [dstq +strideq  +16], m7
+  mova  [dstq +strideq*2+16], m7
+  mova  [dstq +stride3q +16], m7
+
+  ; done!
+  RESTORE_GOT
+  RET
diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 000000000..770a65f4c
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,986 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  /* AVX2 */
+
+#include "./vp9_rtcd.h"
+#include "vpx_ports/mem.h"
+
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh) {
+    __m128i mask, hev, flat, flat2;
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+    __m128i abs_p1p0;
+
+    const __m128i thresh = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _thresh[0]));
+    const __m128i limit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _limit[0]));
+    const __m128i blimit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _blimit[0]));
+
+    q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+    q4p4 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+    q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+    q3p3 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+    q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+    q2p2 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+    q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+    q1p1 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+    p1q1 = _mm_shuffle_epi32(q1p1, 78);
+    q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+    q0p0 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+    p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+    {
+        __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+        abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+                _mm_subs_epu8(q0p0, q1p1));
+        abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+        fe = _mm_set1_epi8(0xfe);
+        ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+        abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+                _mm_subs_epu8(p0q0, q0p0));
+        abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+                _mm_subs_epu8(p1q1, q1p1));
+        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+        hev = _mm_subs_epu8(flat, thresh);
+        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+        mask = _mm_max_epu8(abs_p1p0, mask);
+        // mask |= (abs(p1 - p0) > limit) * -1;
+        // mask |= (abs(q1 - q0) > limit) * -1;
+
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+                        _mm_subs_epu8(q1p1, q2p2)),
+                _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+                        _mm_subs_epu8(q2p2, q3p3)));
+        mask = _mm_max_epu8(work, mask);
+        mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+        mask = _mm_subs_epu8(mask, limit);
+        mask = _mm_cmpeq_epi8(mask, zero);
+    }
+
+    // lp filter
+    {
+        const __m128i t4 = _mm_set1_epi8(4);
+        const __m128i t3 = _mm_set1_epi8(3);
+        const __m128i t80 = _mm_set1_epi8(0x80);
+        const __m128i t1 = _mm_set1_epi16(0x1);
+        __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+        __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+        __m128i qs0 = _mm_xor_si128(p0q0, t80);
+        __m128i qs1 = _mm_xor_si128(p1q1, t80);
+        __m128i filt;
+        __m128i work_a;
+        __m128i filter1, filter2;
+        __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+        __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+        filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+        work_a = _mm_subs_epi8(qs0, qs0ps0);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        filt = _mm_and_si128(filt, mask);
+
+        filter1 = _mm_adds_epi8(filt, t4);
+        filter2 = _mm_adds_epi8(filt, t3);
+
+        filter1 = _mm_unpacklo_epi8(zero, filter1);
+        filter1 = _mm_srai_epi16(filter1, 0xB);
+        filter2 = _mm_unpacklo_epi8(zero, filter2);
+        filter2 = _mm_srai_epi16(filter2, 0xB);
+
+        /* Filter1 >> 3 */
+        filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+        qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+        /* filt >> 1 */
+        filt = _mm_adds_epi16(filter1, t1);
+        filt = _mm_srai_epi16(filt, 1);
+        filt = _mm_andnot_si128(
+                _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+        filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+        qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+        // loopfilter done
+
+        {
+            __m128i work;
+            flat = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+                            _mm_subs_epu8(q0p0, q2p2)),
+                    _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+                            _mm_subs_epu8(q0p0, q3p3)));
+            flat = _mm_max_epu8(abs_p1p0, flat);
+            flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+            flat = _mm_subs_epu8(flat, one);
+            flat = _mm_cmpeq_epi8(flat, zero);
+            flat = _mm_and_si128(flat, mask);
+
+            q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+            q5p5 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+                            (__m64 *) (s + 5 * p)));
+
+            q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+            q6p6 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+                            (__m64 *) (s + 6 * p)));
+
+            flat2 = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+                            _mm_subs_epu8(q0p0, q4p4)),
+                    _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+                            _mm_subs_epu8(q0p0, q5p5)));
+
+            q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+            q7p7 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+                            (__m64 *) (s + 7 * p)));
+
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+                            _mm_subs_epu8(q0p0, q6p6)),
+                    _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+                            _mm_subs_epu8(q0p0, q7p7)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+            flat2 = _mm_subs_epu8(flat2, one);
+            flat2 = _mm_cmpeq_epi8(flat2, zero);
+            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+        }
+
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        // flat and wide flat calculations
+        {
+            const __m128i eight = _mm_set1_epi16(8);
+            const __m128i four = _mm_set1_epi16(4);
+            __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+            __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+            __m128i pixelFilter_p, pixelFilter_q;
+            __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+            __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+            p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+            p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+            p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+            p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+            p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+            p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+            p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+            p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+            q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+            q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+            q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+            q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+            q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+            q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+            q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+            q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+            pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+                    _mm_add_epi16(p4_16, p3_16));
+            pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+                    _mm_add_epi16(q4_16, q3_16));
+
+            pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+                    _mm_add_epi16(p2_16, p1_16));
+            pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+            pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+                    _mm_add_epi16(q2_16, q1_16));
+            pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+            pixelFilter_p = _mm_add_epi16(eight,
+                    _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+            pixetFilter_p2p1p0 = _mm_add_epi16(four,
+                    _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+                    4);
+            flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(p3_16, p0_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(q3_16, q0_16)), 3);
+
+            flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(p7_16, p7_16);
+            sum_q7 = _mm_add_epi16(q7_16, q7_16);
+            sum_p3 = _mm_add_epi16(p3_16, p3_16);
+            sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+                    4);
+            flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(sum_p3, p1_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_q2q1q0,
+                            _mm_add_epi16(sum_q3, q1_16)), 3);
+            flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+            sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+                    4);
+            flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(sum_p3, p2_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_q2q1q0,
+                            _mm_add_epi16(sum_q3, q2_16)), 3);
+            flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+                    4);
+            flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+                    4);
+            flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+                    4);
+            flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+                    4);
+            flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+        }
+        // wide flat
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+        flat = _mm_shuffle_epi32(flat, 68);
+        flat2 = _mm_shuffle_epi32(flat2, 68);
+
+        q2p2 = _mm_andnot_si128(flat, q2p2);
+        flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+        q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+        qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+        flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+        q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+        qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+        flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+        q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+        q6p6 = _mm_andnot_si128(flat2, q6p6);
+        flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+        q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+        _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+        _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+        q5p5 = _mm_andnot_si128(flat2, q5p5);
+        flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+        q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+        _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+        _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+        q4p4 = _mm_andnot_si128(flat2, q4p4);
+        flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+        q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+        _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+        _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+        q3p3 = _mm_andnot_si128(flat2, q3p3);
+        flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+        q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+        _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+        _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+        q2p2 = _mm_andnot_si128(flat2, q2p2);
+        flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+        q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+        _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+        _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+        q1p1 = _mm_andnot_si128(flat2, q1p1);
+        flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+        q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+        _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+        _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+        q0p0 = _mm_andnot_si128(flat2, q0p0);
+        flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+        q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+        _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+        _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+    }
+}
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh) {
+    __m128i mask, hev, flat, flat2;
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i p7, p6, p5;
+    __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+    __m128i q5, q6, q7;
+    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+            p256_0, q256_0;
+
+    const __m128i thresh = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _thresh[0]));
+    const __m128i limit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _limit[0]));
+    const __m128i blimit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _blimit[0]));
+
+    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 5 * p)));
+    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 4 * p)));
+    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 3 * p)));
+    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 2 * p)));
+    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 1 * p)));
+    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 0 * p)));
+    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 1 * p)));
+    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 2 * p)));
+    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 3 * p)));
+    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 4 * p)));
+
+    p4 = _mm256_castsi256_si128(p256_4);
+    p3 = _mm256_castsi256_si128(p256_3);
+    p2 = _mm256_castsi256_si128(p256_2);
+    p1 = _mm256_castsi256_si128(p256_1);
+    p0 = _mm256_castsi256_si128(p256_0);
+    q0 = _mm256_castsi256_si128(q256_0);
+    q1 = _mm256_castsi256_si128(q256_1);
+    q2 = _mm256_castsi256_si128(q256_2);
+    q3 = _mm256_castsi256_si128(q256_3);
+    q4 = _mm256_castsi256_si128(q256_4);
+
+    {
+        const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                _mm_subs_epu8(p0, p1));
+        const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                _mm_subs_epu8(q0, q1));
+        const __m128i fe = _mm_set1_epi8(0xfe);
+        const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+        __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                _mm_subs_epu8(q0, p0));
+        __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                _mm_subs_epu8(q1, p1));
+        __m128i work;
+        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+        hev = _mm_subs_epu8(flat, thresh);
+        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+        mask = _mm_max_epu8(flat, mask);
+        // mask |= (abs(p1 - p0) > limit) * -1;
+        // mask |= (abs(q1 - q0) > limit) * -1;
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+                _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+        mask = _mm_max_epu8(work, mask);
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+                _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+        mask = _mm_max_epu8(work, mask);
+        mask = _mm_subs_epu8(mask, limit);
+        mask = _mm_cmpeq_epi8(mask, zero);
+    }
+
+    // lp filter
+    {
+        const __m128i t4 = _mm_set1_epi8(4);
+        const __m128i t3 = _mm_set1_epi8(3);
+        const __m128i t80 = _mm_set1_epi8(0x80);
+        const __m128i te0 = _mm_set1_epi8(0xe0);
+        const __m128i t1f = _mm_set1_epi8(0x1f);
+        const __m128i t1 = _mm_set1_epi8(0x1);
+        const __m128i t7f = _mm_set1_epi8(0x7f);
+
+        __m128i ps1 = _mm_xor_si128(p1, t80);
+        __m128i ps0 = _mm_xor_si128(p0, t80);
+        __m128i qs0 = _mm_xor_si128(q0, t80);
+        __m128i qs1 = _mm_xor_si128(q1, t80);
+        __m128i filt;
+        __m128i work_a;
+        __m128i filter1, filter2;
+        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+                flat_q2;
+
+        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+        work_a = _mm_subs_epi8(qs0, ps0);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        filt = _mm_and_si128(filt, mask);
+
+        filter1 = _mm_adds_epi8(filt, t4);
+        filter2 = _mm_adds_epi8(filt, t3);
+
+        /* Filter1 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter1);
+        filter1 = _mm_srli_epi16(filter1, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter1 = _mm_and_si128(filter1, t1f);
+        filter1 = _mm_or_si128(filter1, work_a);
+        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+        /* Filter2 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter2);
+        filter2 = _mm_srli_epi16(filter2, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter2 = _mm_and_si128(filter2, t1f);
+        filter2 = _mm_or_si128(filter2, work_a);
+        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+        /* filt >> 1 */
+        filt = _mm_adds_epi8(filter1, t1);
+        work_a = _mm_cmpgt_epi8(zero, filt);
+        filt = _mm_srli_epi16(filt, 1);
+        work_a = _mm_and_si128(work_a, t80);
+        filt = _mm_and_si128(filt, t7f);
+        filt = _mm_or_si128(filt, work_a);
+        filt = _mm_andnot_si128(hev, filt);
+        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+        // loopfilter done
+
+        {
+            __m128i work;
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+            flat = _mm_subs_epu8(flat, one);
+            flat = _mm_cmpeq_epi8(flat, zero);
+            flat = _mm_and_si128(flat, mask);
+
+            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 6 * p)));
+            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 5 * p)));
+            p5 = _mm256_castsi256_si128(p256_5);
+            q5 = _mm256_castsi256_si128(q256_5);
+            flat2 = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 7 * p)));
+            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 6 * p)));
+            p6 = _mm256_castsi256_si128(p256_6);
+            q6 = _mm256_castsi256_si128(q256_6);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+
+            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 8 * p)));
+            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 7 * p)));
+            p7 = _mm256_castsi256_si128(p256_7);
+            q7 = _mm256_castsi256_si128(q256_7);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            flat2 = _mm_subs_epu8(flat2, one);
+            flat2 = _mm_cmpeq_epi8(flat2, zero);
+            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+        }
+
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        // flat and wide flat calculations
+        {
+            const __m256i eight = _mm256_set1_epi16(8);
+            const __m256i four = _mm256_set1_epi16(4);
+            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+                    res_q;
+
+            const __m256i filter = _mm256_load_si256(
+                                  (__m256i const *)filt_loopfilter_avx2);
+            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+            q256_7 = _mm256_shuffle_epi8(q256_7, filter);
+
+            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+                    _mm256_add_epi16(p256_4, p256_3));
+            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+                    _mm256_add_epi16(q256_4, q256_3));
+
+            pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+                    _mm256_add_epi16(p256_2, p256_1));
+            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+            pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+                    _mm256_add_epi16(q256_2, q256_1));
+            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+            pixelFilter_p = _mm256_add_epi16(eight,
+                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+            pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+                    _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(p256_7, p256_0)), 4);
+
+            flat2_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(q256_7, q256_0)), 4);
+
+            flat2_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(p256_3, p256_0)), 3);
+
+            flat_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(q256_3, q256_0)), 3);
+
+            flat_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+            flat2_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+            flat2_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+            flat_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+            flat_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+            flat2_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+            flat2_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+            flat_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+            flat_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+            flat2_p3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+            flat2_q3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+            flat2_p4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+            flat2_q4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+            flat2_p5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+            flat2_q5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+            flat2_p6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+            flat2_q6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+        }
+
+        // wide flat
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+        p2 = _mm_andnot_si128(flat, p2);
+        flat_p2 = _mm_and_si128(flat, flat_p2);
+        p2 = _mm_or_si128(flat_p2, p2);
+
+        p1 = _mm_andnot_si128(flat, ps1);
+        flat_p1 = _mm_and_si128(flat, flat_p1);
+        p1 = _mm_or_si128(flat_p1, p1);
+
+        p0 = _mm_andnot_si128(flat, ps0);
+        flat_p0 = _mm_and_si128(flat, flat_p0);
+        p0 = _mm_or_si128(flat_p0, p0);
+
+        q0 = _mm_andnot_si128(flat, qs0);
+        flat_q0 = _mm_and_si128(flat, flat_q0);
+        q0 = _mm_or_si128(flat_q0, q0);
+
+        q1 = _mm_andnot_si128(flat, qs1);
+        flat_q1 = _mm_and_si128(flat, flat_q1);
+        q1 = _mm_or_si128(flat_q1, q1);
+
+        q2 = _mm_andnot_si128(flat, q2);
+        flat_q2 = _mm_and_si128(flat, flat_q2);
+        q2 = _mm_or_si128(flat_q2, q2);
+
+        p6 = _mm_andnot_si128(flat2, p6);
+        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+        p6 = _mm_or_si128(flat2_p6, p6);
+        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+        p5 = _mm_andnot_si128(flat2, p5);
+        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+        p5 = _mm_or_si128(flat2_p5, p5);
+        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+        p4 = _mm_andnot_si128(flat2, p4);
+        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+        p4 = _mm_or_si128(flat2_p4, p4);
+        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+        p3 = _mm_andnot_si128(flat2, p3);
+        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+        p3 = _mm_or_si128(flat2_p3, p3);
+        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+        p2 = _mm_andnot_si128(flat2, p2);
+        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+        p2 = _mm_or_si128(flat2_p2, p2);
+        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+        p1 = _mm_andnot_si128(flat2, p1);
+        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+        p1 = _mm_or_si128(flat2_p1, p1);
+        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+        p0 = _mm_andnot_si128(flat2, p0);
+        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+        p0 = _mm_or_si128(flat2_p0, p0);
+        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+        q0 = _mm_andnot_si128(flat2, q0);
+        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+        q0 = _mm_or_si128(flat2_q0, q0);
+        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+        q1 = _mm_andnot_si128(flat2, q1);
+        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+        q1 = _mm_or_si128(flat2_q1, q1);
+        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+        q2 = _mm_andnot_si128(flat2, q2);
+        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+        q2 = _mm_or_si128(flat2_q2, q2);
+        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+        q3 = _mm_andnot_si128(flat2, q3);
+        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+        q3 = _mm_or_si128(flat2_q3, q3);
+        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+        q4 = _mm_andnot_si128(flat2, q4);
+        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+        q4 = _mm_or_si128(flat2_q4, q4);
+        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+        q5 = _mm_andnot_si128(flat2, q5);
+        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+        q5 = _mm_or_si128(flat2_q5, q5);
+        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+        q6 = _mm_andnot_si128(flat2, q6);
+        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+        q6 = _mm_or_si128(flat2_q6, q6);
+        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+    }
+}
+
+void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh, int count) {
+    if (count == 1)
+        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+    else
+        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
new file mode 100644
index 000000000..e321dbebe
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -0,0 +1,1580 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
+                                            int p,
+                                            const unsigned char *_blimit,
+                                            const unsigned char *_limit,
+                                            const unsigned char *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
+  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+  __m128i abs_p1p0;
+
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
+                                       (__m64 *)(s + 4 * p)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
+                                       (__m64 *)(s + 3 * p)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
+                                       (__m64 *)(s + 2 * p)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
+                                       (__m64 *)(s + 1 * p)));
+  p1q1 = _mm_shuffle_epi32(q1p1, 78);
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
+                                       (__m64 *)(s - 0 * p)));
+  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+  {
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);
+    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
+    fe = _mm_set1_epi8(0xfe);
+    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    abs_p0q0 = abs_diff(q0p0, p0q0);
+    abs_p1q1 = abs_diff(q1p1, p1q1);
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(abs_p1p0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i t1 = _mm_set1_epi16(0x1);
+    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+    __m128i qs0 = _mm_xor_si128(p0q0, t80);
+    __m128i qs1 = _mm_xor_si128(p1q1, t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, qs0ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    filter1 = _mm_unpacklo_epi8(zero, filter1);
+    filter1 = _mm_srai_epi16(filter1, 0xB);
+    filter2 = _mm_unpacklo_epi8(zero, filter2);
+    filter2 = _mm_srai_epi16(filter2, 0xB);
+
+    // Filter1 >> 3
+    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+    // filt >> 1
+    filt = _mm_adds_epi16(filter1, t1);
+    filt = _mm_srai_epi16(filt, 1);
+    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+                            filt);
+    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+    // loopfilter done
+
+    {
+      __m128i work;
+      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+      flat = _mm_max_epu8(abs_p1p0, flat);
+      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+      flat = _mm_subs_epu8(flat, one);
+      flat = _mm_cmpeq_epi8(flat, zero);
+      flat = _mm_and_si128(flat, mask);
+
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
+                                           (__m64 *)(s + 5 * p)));
+
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
+                                           (__m64 *)(s + 6 * p)));
+      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
+
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
+                                           (__m64 *)(s + 7 * p)));
+      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
+      flat2 = _mm_max_epu8(work, flat2);
+      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+      flat2 = _mm_subs_epu8(flat2, one);
+      flat2 = _mm_cmpeq_epi8(flat2, zero);
+      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    }
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // flat and wide flat calculations
+    {
+      const __m128i eight = _mm_set1_epi16(8);
+      const __m128i four = _mm_set1_epi16(4);
+      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+      __m128i pixelFilter_p, pixelFilter_q;
+      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+      p7_16 = _mm_unpacklo_epi8(q7p7, zero);;
+      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+                                    _mm_add_epi16(p4_16, p3_16));
+      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+                                    _mm_add_epi16(q4_16, q3_16));
+
+      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+      pixelFilter_p =  _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+      pixelFilter_q =  _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+      pixelFilter_p =  _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                         pixelFilter_q));
+      pixetFilter_p2p1p0 =   _mm_add_epi16(four,
+                                           _mm_add_epi16(pixetFilter_p2p1p0,
+                                                         pixetFilter_q2q1q0));
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                           _mm_add_epi16(p7_16, p0_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                           _mm_add_epi16(q7_16, q0_16)), 4);
+      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(p3_16, p0_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(q3_16, q0_16)), 3);
+
+      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(p7_16, p7_16);
+      sum_q7 = _mm_add_epi16(q7_16, q7_16);
+      sum_p3 = _mm_add_epi16(p3_16, p3_16);
+      sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p1_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q1_16)), 4);
+      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                             _mm_add_epi16(sum_p3, p1_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                             _mm_add_epi16(sum_q3, q1_16)), 3);
+      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p2_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q2_16)), 4);
+      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(sum_p3, p2_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                           _mm_add_epi16(sum_q3, q2_16)), 3);
+      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p3_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q3_16)), 4);
+      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p4_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q4_16)), 4);
+      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p5_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q5_16)), 4);
+      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p6_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q6_16)), 4);
+      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+    }
+    // wide flat
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    flat = _mm_shuffle_epi32(flat, 68);
+    flat2 = _mm_shuffle_epi32(flat2, 68);
+
+    q2p2 = _mm_andnot_si128(flat, q2p2);
+    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+    q6p6 = _mm_andnot_si128(flat2, q6p6);
+    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
+    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+
+    q5p5 = _mm_andnot_si128(flat2, q5p5);
+    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+
+    q4p4 = _mm_andnot_si128(flat2, q4p4);
+    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+
+    q3p3 = _mm_andnot_si128(flat2, q3p3);
+    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+
+    q2p2 = _mm_andnot_si128(flat2, q2p2);
+    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+
+    q1p1 = _mm_andnot_si128(flat2, q1p1);
+    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+
+    q0p0 = _mm_andnot_si128(flat2, q0p0);
+    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * p),  _mm_castsi128_ps(q0p0));
+  }
+}
+
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+                                       const __m128i *const a1,
+                                       const __m128i *const a2,
+                                       const __m128i *const s1,
+                                       const __m128i *const s2) {
+  __m128i x = _mm_add_epi16(*a1, *total);
+  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+  return x;
+}
+
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+                                   const __m128i *const other_filt,
+                                   const __m128i *const f8_lo,
+                                   const __m128i *const f8_hi) {
+  const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
+                                      _mm_srli_epi16(*f8_hi, 3));
+  const __m128i result = _mm_and_si128(*flat, f8);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+                                    const __m128i *const other_filt,
+                                    const __m128i *const f_lo,
+                                    const __m128i *const f_hi) {
+  const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
+                                     _mm_srli_epi16(*f_hi, 4));
+  const __m128i result = _mm_and_si128(*flat, f);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
+                                             int p,
+                                             const unsigned char *_blimit,
+                                             const unsigned char *_limit,
+                                             const unsigned char *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
+  __m128i p7, p6, p5;
+  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+  __m128i q5, q6, q7;
+
+  __m128i op2, op1, op0, oq0, oq1, oq2;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+
+  {
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
+    __m128i work;
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  {
+    __m128i work;
+    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
+
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+    // Filter2 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+    // loopfilter done
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // filter8
+    {
+      const __m128i four = _mm_set1_epi16(4);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      __m128i f8_lo, f8_hi;
+
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+                            _mm_add_epi16(p3_lo, p2_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+                            _mm_add_epi16(p2_lo, p1_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+                            _mm_add_epi16(p3_hi, p2_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+                            _mm_add_epi16(p2_hi, p1_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+    }
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // wide flat calculations
+    {
+      const __m128i eight = _mm_set1_epi16(8);
+      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
+      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
+
+      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
+      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
+
+      __m128i f_lo;
+      __m128i f_hi;
+
+      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
+      f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
+                           _mm_add_epi16(p4_lo, f_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+                           _mm_add_epi16(p2_lo, p1_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
+
+      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
+      f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
+                           _mm_add_epi16(p4_hi, f_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+                           _mm_add_epi16(p2_hi, p1_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
+
+      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+
+      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+    }
+    // wide flat
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  }
+}
+
+// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
+void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh, int count) {
+  if (count == 1)
+    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
+  else
+    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
+}
+
+void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh, int count) {
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+
+  (void)count;
+
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  p1q1 = _mm_shuffle_epi32(q1p1, 78);
+  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+  {
+    // filter_mask and hev_mask
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);
+    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
+
+    abs_p0q0 = abs_diff(q0p0, p0q0);
+    abs_p1q1 = abs_diff(q1p1, p1q1);
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(abs_p1p0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    // flat_mask4
+
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
+                        abs_diff(q3p3, q0p0));
+    flat = _mm_max_epu8(abs_p1p0, flat);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    unsigned char *src = s;
+    {
+      __m128i workp_a, workp_b, workp_shft;
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+
+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+    }
+  }
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    filter1 = _mm_unpacklo_epi8(zero, filter1);
+    filter1 = _mm_srai_epi16(filter1, 11);
+    filter1 = _mm_packs_epi16(filter1, filter1);
+
+    // Filter2 >> 3
+    filter2 = _mm_unpacklo_epi8(zero, filter2);
+    filter2 = _mm_srai_epi16(filter2, 11);
+    filter2 = _mm_packs_epi16(filter2, zero);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    filt = _mm_unpacklo_epi8(zero, filt);
+    filt = _mm_srai_epi16(filt, 9);
+    filt = _mm_packs_epi16(filt, zero);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+  }
+}
+
+void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
+                                    const uint8_t *_blimit0,
+                                    const uint8_t *_limit0,
+                                    const uint8_t *_thresh0,
+                                    const uint8_t *_blimit1,
+                                    const uint8_t *_limit1,
+                                    const uint8_t *_thresh1) {
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i mask, hev, flat;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+
+    // filter_mask and hev_mask
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    // flat_mask4
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                     _mm_subs_epu8(p0, p2)),
+                         _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                      _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                     _mm_subs_epu8(p0, p3)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                      _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    unsigned char *src = s;
+    int i = 0;
+
+    do {
+      __m128i workp_a, workp_b, workp_shft;
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+
+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      src += 8;
+    } while (++i < 2);
+  }
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+
+    // Filter2 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_load_si128((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_load_si128((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_load_si128((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_load_si128((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_load_si128((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_load_si128((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+  }
+}
+
+void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  const __m128i blimit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i mask, hev, flat;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+  // filter_mask and hev_mask
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+
+    // Filter2 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  }
+}
+
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+                                 int in_p, unsigned char *out, int out_p) {
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+  // Read in 16 lines
+  x0 = _mm_loadl_epi64((__m128i *)in0);
+  x8 = _mm_loadl_epi64((__m128i *)in1);
+  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
+  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
+  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
+  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
+  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
+  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
+  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
+  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
+  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
+  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
+  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
+  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
+  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
+  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
+
+  x0 = _mm_unpacklo_epi8(x0, x1);
+  x1 = _mm_unpacklo_epi8(x2, x3);
+  x2 = _mm_unpacklo_epi8(x4, x5);
+  x3 = _mm_unpacklo_epi8(x6, x7);
+
+  x8 = _mm_unpacklo_epi8(x8, x9);
+  x9 = _mm_unpacklo_epi8(x10, x11);
+  x10 = _mm_unpacklo_epi8(x12, x13);
+  x11 = _mm_unpacklo_epi8(x14, x15);
+
+  x4 = _mm_unpacklo_epi16(x0, x1);
+  x5 = _mm_unpacklo_epi16(x2, x3);
+  x12 = _mm_unpacklo_epi16(x8, x9);
+  x13 = _mm_unpacklo_epi16(x10, x11);
+
+  x6 = _mm_unpacklo_epi32(x4, x5);
+  x7 = _mm_unpackhi_epi32(x4, x5);
+  x14 = _mm_unpacklo_epi32(x12, x13);
+  x15 = _mm_unpackhi_epi32(x12, x13);
+
+  // Store first 4-line result
+  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+  x4 = _mm_unpackhi_epi16(x0, x1);
+  x5 = _mm_unpackhi_epi16(x2, x3);
+  x12 = _mm_unpackhi_epi16(x8, x9);
+  x13 = _mm_unpackhi_epi16(x10, x11);
+
+  x6 = _mm_unpacklo_epi32(x4, x5);
+  x7 = _mm_unpackhi_epi32(x4, x5);
+  x14 = _mm_unpacklo_epi32(x12, x13);
+  x15 = _mm_unpackhi_epi32(x12, x13);
+
+  // Store second 4-line result
+  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
+static INLINE void transpose(unsigned char *src[], int in_p,
+                             unsigned char *dst[], int out_p,
+                             int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    unsigned char *in = src[idx8x8];
+    unsigned char *out = dst[idx8x8];
+
+    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    x0 = _mm_unpacklo_epi8(x0, x1);
+    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+    x1 = _mm_unpacklo_epi8(x2, x3);
+    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+    x2 = _mm_unpacklo_epi8(x4, x5);
+    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+    x3 = _mm_unpacklo_epi8(x6, x7);
+    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    x4 = _mm_unpacklo_epi16(x0, x1);
+    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+    x5 = _mm_unpacklo_epi16(x2, x3);
+    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 0*out_p),
+                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
+    _mm_storeh_pd((double *)(out + 1*out_p),
+                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    _mm_storel_pd((double *)(out + 2*out_p),
+                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
+    _mm_storeh_pd((double *)(out + 3*out_p),
+                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
+
+    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi16(x0, x1);
+    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi16(x2, x3);
+    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 4*out_p),
+                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
+    _mm_storeh_pd((double *)(out + 5*out_p),
+                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    _mm_storel_pd((double *)(out + 6*out_p),
+                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
+    _mm_storeh_pd((double *)(out + 7*out_p),
+                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  // Transpose 8x16
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                                 blimit1, limit1, thresh1);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  transpose(src, 16, dst, p, 2);
+}
+
+void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
+                             const unsigned char *blimit,
+                             const unsigned char *limit,
+                             const unsigned char *thresh, int count) {
+  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
+  unsigned char *src[1];
+  unsigned char *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  transpose(src, 8, dst, p, 1);
+}
+
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  // Transpose 8x16
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                                 blimit1, limit1, thresh1);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  transpose(src, 16, dst, p, 2);
+}
+
+void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
+                              const unsigned char *blimit,
+                              const unsigned char *limit,
+                              const unsigned char *thresh) {
+  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  src[0] = s - 8;
+  src[1] = s;
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;
+
+  // Transpose 16x8
+  transpose(src, p, dst, 8, 2);
+
+  // Loop filtering
+  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
+
+  src[0] = t_dst;
+  src[1] = t_dst + 8 * 8;
+  dst[0] = s - 8;
+  dst[1] = s;
+
+  // Transpose back
+  transpose(src, 8, dst, p, 2);
+}
+
+void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+  // Transpose 16x16
+  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+  // Loop filtering
+  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                   thresh);
+
+  // Transpose back
+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
new file mode 100644
index 000000000..f5f7d5af7
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -0,0 +1,611 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp9_lpf_horizontal_4_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int  count
+;)
+global sym(vp9_lpf_horizontal_4_mmx) PRIVATE
+sym(vp9_lpf_horizontal_4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_h:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        movq        mm2, [rdi+2*rax]      ; q3
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7              ;
+
+
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; save to t0
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm2, mm3              ; p1
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; save to t1
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0
+        movq        mm3, [rsi]            ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5
+        pxor        mm5,    mm5
+        pcmpeqb     mm1,    mm5           ; mask mm1
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ;
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     mm4,        mm5
+
+        pcmpeqb     mm5,        mm5
+        pxor        mm4,        mm5
+
+
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        mm1, mm2                  ; mask filter values we don't care about
+        movq        mm2, mm1
+        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        pxor        mm0, mm0             ;
+        pxor        mm5, mm5
+        punpcklbw   mm0, mm2            ;
+        punpckhbw   mm5, mm2            ;
+        psraw       mm0, 11             ;
+        psraw       mm5, 11
+        packsswb    mm0, mm5
+        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0, mm0              ; 0
+        movq        mm5, mm1              ; abcdefgh
+        punpcklbw   mm0, mm1              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        pxor        mm1, mm1              ; 0
+        punpckhbw   mm1, mm5              ; a0b0c0d0
+        psraw       mm1, 11               ; sign extended shift right by 3
+        movq        mm5, mm0              ; save results
+
+        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5, [GLOBAL(ones)]
+        paddsw      mm1, [GLOBAL(ones)]
+        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+        pandn       mm4, mm5              ; high edge variance additive
+
+        paddsb      mm6, mm2              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        movq        mm6, [rsi+2*rax]      ; p1
+        pxor        mm6, [GLOBAL(t80)]    ; reoffset
+        paddsb      mm6, mm4              ; p1+= p1 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+2*rax], mm6      ; write back
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+        psubsb      mm7, mm4              ; q1-= q1 add
+        pxor        mm7, [GLOBAL(t80)]    ; unoffset
+        movq        [rdi], mm7            ; write back
+
+        add         rsi,8
+        neg         rax
+        dec         rcx
+        jnz         .next8_h
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_lpf_vertical_4_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+global sym(vp9_lpf_vertical_4_mmx) PRIVATE
+sym(vp9_lpf_vertical_4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 64      ; reserve 64 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        lea         rsi,        [rsi + rax*4 - 4]
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_v:
+        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
+        add         rdi,        rax
+
+
+        ;transpose
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
+
+        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
+        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 36 35 25 34 24
+
+        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
+        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
+
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+        psubusb     mm5,        mm7                         ; q2-q3
+
+        psubusb     mm7,        mm6                         ; q3-q2
+        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
+
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+
+        psubusb     mm3,        mm6                         ; q1-q2
+        psubusb     mm6,        mm5                         ; q2-q1
+
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+        lea         rdx,        srct
+
+        movq        [rdx+24],   mm5                         ; save q1
+        movq        [rdx+16],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+8],    mm3                         ; save p0
+
+        movq        [rdx],      mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4
+
+        psubusb     mm0,        mm4
+        psubusb     mm1,        mm4
+
+        psubusb     mm6,        mm4
+        por         mm7,        mm6
+
+        por         mm0,        mm1
+        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+16]                    ; mm5=q0
+        movq        mm7,        [rdx+24]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0;                        ; mask
+
+        pxor        mm0,        mm0
+        pcmpeqb     mm1,        mm0
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (q1 - q0)
+        psubusb     mm4,        mm7
+
+        movq        mm3,        t1              ; get abs (p1 - p0)
+        psubusb     mm3,        mm7
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0
+
+        pcmpeqb     mm0,        mm0
+        pxor        mm4,        mm0
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        movq        mm2,        [rdx]           ; p1
+        movq        mm7,        [rdx+24]        ; q1
+
+        movq        mm6,        [rdx+8]         ; p0
+        movq        mm0,        [rdx+16]        ; q0
+
+        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
+        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
+
+        psubsb      mm2,        mm7             ; p1 - q1
+        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
+
+        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
+        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
+
+        movq        mm3,        mm0             ; q0
+        psubsb      mm0,        mm6             ; q0 - p0
+
+        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand       mm1,        mm2              ; mask filter values we don't care about
+
+        movq        mm2,        mm1
+        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+        pxor        mm0,        mm0          ;
+
+        pxor        mm5,        mm5
+        punpcklbw   mm0,        mm2         ;
+
+        punpckhbw   mm5,        mm2         ;
+        psraw       mm0,        11              ;
+
+        psraw       mm5,        11
+        packsswb    mm0,        mm5
+
+        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0,        mm0           ; 0
+        movq        mm5,        mm1           ; abcdefgh
+
+        punpcklbw   mm0,        mm1           ; e0f0g0h0
+        psraw       mm0,        11                ; sign extended shift right by 3
+
+        pxor        mm1,        mm1           ; 0
+        punpckhbw   mm1,        mm5           ; a0b0c0d0
+
+        psraw       mm1,        11                ; sign extended shift right by 3
+        movq        mm5,        mm0              ; save results
+
+        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5,        [GLOBAL(ones)]
+
+        paddsw      mm1,        [GLOBAL(ones)]
+        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
+
+        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
+        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+        pandn       mm4,        mm5             ; high edge variance additive
+
+        paddsb      mm6,        mm2             ; p0+= p0 add
+        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
+
+        ; mm6=p0                               ;
+        movq        mm1,        [rdx]           ; p1
+        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
+
+        paddsb      mm1,        mm4                 ; p1+= p1 add
+        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
+        ; mm6 = p0 mm1 = p1
+
+        psubsb      mm3,        mm0                 ; q0-= q0 add
+        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
+
+        ; mm3 = q0
+        psubsb      mm7,        mm4                 ; q1-= q1 add
+        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
+        ; mm7 = q1
+
+        ; transpose and write back
+        ; mm1 =    72 62 52 42 32 22 12 02
+        ; mm6 =    73 63 53 43 33 23 13 03
+        ; mm3 =    74 64 54 44 34 24 14 04
+        ; mm7 =    75 65 55 45 35 25 15 05
+
+        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
+        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
+
+        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
+        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
+
+        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
+        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
+
+        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
+        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
+
+        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
+        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
+
+        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
+        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
+
+
+        ; mm2 = 15 14 13 12 05 04 03 02
+        ; mm6 = 35 34 33 32 25 24 23 22
+        ; mm5 = 55 54 53 52 45 44 43 42
+        ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+        movd        [rsi+rax*4+2], mm2
+        psrlq       mm2,        32
+
+        movd        [rdi+rax*4+2], mm2
+        movd        [rsi+rax*2+2], mm6
+
+        psrlq       mm6,        32
+        movd        [rsi+rax+2],mm6
+
+        movd        [rsi+2],    mm1
+        psrlq       mm1,        32
+
+        movd        [rdi+2],    mm1
+        neg         rax
+
+        movd        [rdi+rax+2],mm5
+        psrlq       mm5,        32
+
+        movd        [rdi+rax*2+2], mm5
+
+        lea         rsi,        [rsi+rax*8]
+        dec         rcx
+        jnz         .next8_v
+
+    add rsp, 64
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+tfe:
+    times 8 db 0xfe
+align 16
+t80:
+    times 8 db 0x80
+align 16
+t3:
+    times 8 db 0x03
+align 16
+t4:
+    times 8 db 0x04
+align 16
+ones:
+    times 4 dw 0x0001
diff --git a/media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 000000000..6029420d1
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;  This file is a duplicate of mfqe_sse2.asm in VP8.
+;  TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_filter_by_weight16x16_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6
+
+.combine
+    movdqa      xmm2, [rax]
+    movdqa      xmm4, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3
+    movdqa      [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp9_filter_by_weight8x8_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4
+
+.combine
+    movq        xmm2, [rax]
+    movq        xmm3, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+
+    packuswb    xmm2, xmm4
+    movq        [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4
+;    unsigned int  *sad,           5
+;)
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,        arg(0)          ; src1
+    mov         rcx,        arg(1)          ; stride1
+    mov         rdx,        arg(2)          ; src2
+    mov         rdi,        arg(3)          ; stride2
+
+    mov         rsi,        16              ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; Because we're working with the actual output frames
+    ; we can't depend on any kind of data alignment.
+.accumulate
+    movdqa      xmm0, [rax]                 ; src1
+    movdqa      xmm1, [rdx]                 ; src2
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi,        1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]
+    psrld       xmm0, 8
+
+    mov         rax,  arg(5)
+    movd        [rax], xmm0
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square src2. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. promote to doubles, accumulate,
+    ; shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+
+    psubd       xmm1, xmm0
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+
+    movd        [rax], xmm1
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t128:
+%ifndef __NASM_VER__
+    ddq 128
+%elif CONFIG_BIG_ENDIAN
+    dq  0, 128
+%else
+    dq  128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08
diff --git a/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm b/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm
new file mode 100644
index 000000000..ec8bfdb18
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm
@@ -0,0 +1,694 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_post_proc_down_and_across_xmm
+;(
+;    unsigned char *src_ptr,
+;    unsigned char *dst_ptr,
+;    int src_pixels_per_line,
+;    int dst_pixels_per_line,
+;    int rows,
+;    int cols,
+;    int flimit
+;)
+global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
+sym(vp9_post_proc_down_and_across_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ALIGN_STACK 16, rax
+    ; move the global rd onto the stack, since we don't have enough registers
+    ; to do PIC addressing
+    movdqa      xmm0, [GLOBAL(rd42)]
+    sub         rsp, 16
+    movdqa      [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [GLOBAL(rd42)]
+%endif
+
+
+        movd        xmm2,       dword ptr arg(6) ;flimit
+        punpcklwd   xmm2,       xmm2
+        punpckldq   xmm2,       xmm2
+        punpcklqdq  xmm2,       xmm2
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;dst_ptr
+
+        movsxd      rcx,        DWORD PTR arg(4) ;rows
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
+        pxor        xmm0,       xmm0              ; mm0 = 00000000
+
+.nextrow:
+
+        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+.nextcol:
+        movq        xmm3,       QWORD PTR [rsi]         ; mm4 = r0 p0..p7
+        punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
+        movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
+        psllw       xmm3,       2                       ;
+
+        movq        xmm5,       QWORD PTR [rsi + rax]   ; mm4 = r1 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r1 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm6
+
+        ; thresholding
+        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
+        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
+        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+        pcmpgtw     xmm7,       xmm2
+
+        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r2 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r2 p0..p3 - r2 p0..p3
+        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        neg         rax
+        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; mm4 = r-2 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r-2 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
+        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+        movq        xmm4,       QWORD PTR [rsi+rax]     ; mm4 = r-1 p0..p7
+        punpcklbw   xmm4,       xmm0                    ; mm4 = r-1 p0..p3
+        paddusw     xmm3,       xmm4                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm4                    ; mm6 = p0..p3 - r-2 p0..p3
+        psubusw     xmm4,       xmm1                    ; mm5 = r-1 p0..p3 - p0..p3
+        paddusw     xmm6,       xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        paddusw     xmm3,       RD42                    ; mm3 += round value
+        psraw       xmm3,       3                       ; mm3 /= 8
+
+        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
+        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7                    ; combination
+
+        packuswb    xmm1,       xmm0                    ; pack to bytes
+        movq        QWORD PTR [rdi], xmm1             ;
+
+        neg         rax                   ; pitch is positive
+        add         rsi,        8
+        add         rdi,        8
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+
+        jl          .nextcol
+
+        ; done with the all cols, start the across filtering in place
+        sub         rsi,        rdx
+        sub         rdi,        rdx
+
+        xor         rdx,        rdx
+        movq        mm0,        QWORD PTR [rdi-8];
+
+.acrossnextcol:
+        movq        xmm7,       QWORD PTR [rdi +rdx -2]
+        movd        xmm4,       DWORD PTR [rdi +rdx +6]
+
+        pslldq      xmm4,       8
+        por         xmm4,       xmm7
+
+        movdqa      xmm3,       xmm4
+        psrldq      xmm3,       2
+        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
+        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
+        psllw       xmm3,       2
+
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       3
+        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
+        paddusw     xmm3,       xmm5              ; mm3 += mm6
+
+        ; thresholding
+        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
+        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
+        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
+        pcmpgtw     xmm7,       xmm2
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       4
+        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
+        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+
+        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
+        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
+        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        psrldq      xmm4,       1                   ; mm4 = p-1..p5
+        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
+        paddusw     xmm3,       xmm4              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm4              ; mm6 = p0..p3 - p1..p4
+        psubusw     xmm4,       xmm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     xmm6,       xmm4              ; mm6 = abs(p0..p3 - p1..p4)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        paddusw     xmm3,       RD42              ; mm3 += round value
+        psraw       xmm3,       3                 ; mm3 /= 8
+
+        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
+        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7              ; combination
+
+        packuswb    xmm1,       xmm0              ; pack to bytes
+        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous four bytes
+        movdq2q     mm0,        xmm1
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+        jl          .acrossnextcol;
+
+        ; last 8 pixels
+        movq        QWORD PTR [rdi+rdx-8],  mm0
+
+        ; done with this rwo
+        add         rsi,rax               ; next line
+        mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
+        add         rdi,rax               ; next destination
+        mov         eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
+
+        dec         rcx                   ; decrement count
+        jnz         .nextrow              ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    add rsp,16
+    pop rsp
+%endif
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef RD42
+
+
+;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
+;                            int pitch, int rows, int cols,int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_xmm) PRIVATE
+sym(vp9_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp9_rv))]
+%endif
+
+    ;rows +=8;
+    add         dword arg(2), 8
+
+    ;for(c=0; c<cols; c+=8)
+.loop_col:
+            mov         rsi,        arg(0) ; s
+            pxor        xmm0,       xmm0        ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+            neg         rax                                     ; rax = -pitch
+
+            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
+            neg         rax
+
+
+            pxor        xmm5,       xmm5
+            pxor        xmm6,       xmm6        ;
+
+            pxor        xmm7,       xmm7        ;
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movq        xmm1,       QWORD PTR [rdi];
+            punpcklbw   xmm1,       xmm0        ;
+
+            paddw       xmm5,       xmm1        ;
+            pmullw      xmm1,       xmm1        ;
+
+            movdqa      xmm2,       xmm1        ;
+            punpcklwd   xmm1,       xmm0        ;
+
+            punpckhwd   xmm2,       xmm0        ;
+            paddd       xmm6,       xmm1        ;
+
+            paddd       xmm7,       xmm2        ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
+            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   xmm1,       xmm0
+            punpcklbw   xmm2,       xmm0
+
+            paddw       xmm5,       xmm2
+            psubw       xmm5,       xmm1
+
+            pmullw      xmm2,       xmm2
+            movdqa      xmm4,       xmm2
+
+            punpcklwd   xmm2,       xmm0
+            punpckhwd   xmm4,       xmm0
+
+            paddd       xmm6,       xmm2
+            paddd       xmm7,       xmm4
+
+            pmullw      xmm1,       xmm1
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm0
+            psubd       xmm6,       xmm1
+
+            punpckhwd   xmm2,       xmm0
+            psubd       xmm7,       xmm2
+
+
+            movdqa      xmm3,       xmm6
+            pslld       xmm3,       4
+
+            psubd       xmm3,       xmm6
+            movdqa      xmm1,       xmm5
+
+            movdqa      xmm4,       xmm5
+            pmullw      xmm1,       xmm1
+
+            pmulhw      xmm4,       xmm4
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm4
+            punpckhwd   xmm2,       xmm4
+
+            movdqa      xmm4,       xmm7
+            pslld       xmm4,       4
+
+            psubd       xmm4,       xmm7
+
+            psubd       xmm3,       xmm1
+            psubd       xmm4,       xmm2
+
+            psubd       xmm3,       flimit4
+            psubd       xmm4,       flimit4
+
+            psrad       xmm3,       31
+            psrad       xmm4,       31
+
+            packssdw    xmm3,       xmm4
+            packsswb    xmm3,       xmm0
+
+            movq        xmm1,       QWORD PTR [rsi+rax*8]
+
+            movq        xmm2,       xmm1
+            punpcklbw   xmm1,       xmm0
+
+            paddw       xmm1,       xmm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp9_rv))]
+            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
+%else
+            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
+%endif
+
+            paddw       xmm1,       xmm4
+            ;paddw     xmm1,       eight8s
+            psraw       xmm1,       4
+
+            packuswb    xmm1,       xmm0
+            pand        xmm1,       xmm3
+
+            pandn       xmm3,       xmm2
+            por         xmm1,       xmm3
+
+            and         rcx,        15
+            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+
+            and         rcx,        15
+            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
+
+            movq        [rsi],      mm0
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+        add         dword arg(0), 8 ; s += 8
+        sub         dword arg(3), 8 ; cols -= 8
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 128+16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
+;                                int pitch, int rows, int cols,int flimit)
+global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vp9_mbpost_proc_across_ip_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+    ; create flimit4 at [rsp]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp], eax
+    mov         [rsp+4], eax
+    mov         [rsp+8], eax
+    mov         [rsp+12], eax
+%define flimit4 [rsp]
+
+
+    ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+        xor         rdx,    rdx ;sumsq=0;
+        xor         rcx,    rcx ;sum=0;
+        mov         rsi,    arg(0); s
+        mov         rdi,    -8
+.ip_var_loop:
+        ;for(i=-8;i<=6;i++)
+        ;{
+        ;    sumsq += s[i]*s[i];
+        ;    sum   += s[i];
+        ;}
+        movzx       eax, byte [rsi+rdi]
+        add         ecx, eax
+        mul         al
+        add         edx, eax
+        add         rdi, 1
+        cmp         rdi, 6
+        jle         .ip_var_loop
+
+
+            ;mov         rax,    sumsq
+            ;movd        xmm7,   rax
+            movd        xmm7,   edx
+
+            ;mov         rax,    sum
+            ;movd        xmm6,   rax
+            movd        xmm6,   ecx
+
+            mov         rsi,    arg(0) ;s
+            xor         rcx,    rcx
+
+            movsxd      rdx,    dword arg(3) ;cols
+            add         rdx,    8
+            pxor        mm0,    mm0
+            pxor        mm1,    mm1
+
+            pxor        xmm0,   xmm0
+.nextcol4:
+
+            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
+            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
+
+            punpcklbw   xmm1,   xmm0                    ; expanding
+            punpcklbw   xmm2,   xmm0                    ; expanding
+
+            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
+            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
+
+            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
+            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
+
+            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
+            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
+
+            paddd       xmm6,   xmm2
+            paddd       xmm7,   xmm1
+
+            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
+            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
+
+            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
+            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
+
+            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
+
+            paddd       xmm6,   xmm4
+            paddd       xmm7,   xmm3
+
+            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
+            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            movdqa      xmm3,   xmm6
+            pmaddwd     xmm3,   xmm3
+
+            movdqa      xmm5,   xmm7
+            pslld       xmm5,   4
+
+            psubd       xmm5,   xmm7
+            psubd       xmm5,   xmm3
+
+            psubd       xmm5,   flimit4
+            psrad       xmm5,   31
+
+            packssdw    xmm5,   xmm0
+            packsswb    xmm5,   xmm0
+
+            movd        xmm1,   DWORD PTR [rsi+rcx]
+            movq        xmm2,   xmm1
+
+            punpcklbw   xmm1,   xmm0
+            punpcklwd   xmm1,   xmm0
+
+            paddd       xmm1,   xmm6
+            paddd       xmm1,   [GLOBAL(four8s)]
+
+            psrad       xmm1,   4
+            packssdw    xmm1,   xmm0
+
+            packuswb    xmm1,   xmm0
+            pand        xmm1,   xmm5
+
+            pandn       xmm5,   xmm2
+            por         xmm5,   xmm1
+
+            movd        [rsi+rcx-8],  mm0
+            movq        mm0,    mm1
+
+            movdq2q     mm1,    xmm5
+            psrldq      xmm7,   12
+
+            psrldq      xmm6,   12
+            add         rcx,    4
+
+            cmp         rcx,    rdx
+            jl          .nextcol4
+
+        ;s+=pitch;
+        movsxd rax, dword arg(1)
+        add    arg(0), rax
+
+        sub dword arg(2), 1 ;rows-=1
+        cmp dword arg(2), 0
+        jg .ip_row_loop
+
+    add         rsp, 16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int width, unsigned int height, int pitch)
+global sym(vp9_plane_add_noise_wmt) PRIVATE
+sym(vp9_plane_add_noise_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movdqu      xmm1,[rsi+rax]         ; get the source
+
+            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     xmm1, [rdx+32] ;bothclamp
+            psubusb     xmm1, [rdx+16] ;whiteclamp
+
+            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+            paddb       xmm1,xmm2              ; add it in
+            movdqu      [rsi+rax],xmm1         ; store the result
+
+            add         rax,16                 ; move to the next line
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+rd42:
+    times 8 dw 0x04
+four8s:
+    times 4 dd 8
diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
new file mode 100644
index 000000000..cee8d1e76
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,602 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// immintrin.h.
+#include "./vp9_rtcd.h"
+
+#include <immintrin.h>
+
+#include "vp9/common/x86/convolve.h"
+#include "vpx_ports/mem.h"
+
+// filters for 16_h8 and 16_v8
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+#if defined(__clang__)
+# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+      (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
+#  define MM256_BROADCASTSI128_SI256(x) \
+       _mm_broadcastsi128_si256((__m128i const *)&(x))
+# else  // clang > 3.3, and not 5.0 on macosx.
+#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+# endif  // clang <= 3.3
+#elif defined(__GNUC__)
+# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#  define MM256_BROADCASTSI128_SI256(x) \
+       _mm_broadcastsi128_si256((__m128i const *)&(x))
+# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+# else  // gcc > 4.7
+#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+# endif  // gcc <= 4.6
+#else  // !(gcc || clang)
+# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif  // __clang__
+
+static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1, srcReg32b2, filtersReg32;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32,
+                  _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x706u));
+
+  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i-=2) {
+    // load the 2 strides of source
+    srcReg32b1 = _mm256_castsi128_si256(
+                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
+    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+                 _mm_loadu_si128((const __m128i *)
+                 (src_ptr+src_pixels_per_line-3)), 1);
+
+    // filter the source buffer
+    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // reading 2 strides of the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg32b2 = _mm256_castsi128_si256(
+                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
+    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+                 _mm_loadu_si128((const __m128i *)
+                 (src_ptr+src_pixels_per_line+5)), 1);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // filter the source buffer
+    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
+                                           srcRegFilt32b2_1);
+
+    src_ptr+=src_stride;
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr,
+    _mm256_castsi256_si128(srcRegFilt32b1_1));
+
+    // save the next 16 bits
+    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
+    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+    output_ptr+=dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 16 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+    __m128i srcRegFilt2, srcRegFilt3;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
+                    _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
+                  _mm256_castsi256_si128(filt4Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
+                 _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
+                 _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(secondFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // reading the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // filter the source buffer
+    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
+                    _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt4Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
+                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(secondFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm256_castsi256_si128(addFilterReg64));
+
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm256_castsi256_si128(addFilterReg64));
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+  }
+}
+
+static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64;
+  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the
+  // same data in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32,
+                  _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x706u));
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // load 16 bytes 7 times in stride of src_pitch
+  srcReg32b1 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr)));
+  srcReg32b2 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
+  srcReg32b3 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
+  srcReg32b4 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
+  srcReg32b5 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+  srcReg32b6 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+  srcReg32b7 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // have each consecutive loads on the same 256 register
+  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+               _mm256_castsi256_si128(srcReg32b2), 1);
+  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+               _mm256_castsi256_si128(srcReg32b3), 1);
+  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
+               _mm256_castsi256_si128(srcReg32b4), 1);
+  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
+               _mm256_castsi256_si128(srcReg32b5), 1);
+  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
+               _mm256_castsi256_si128(srcReg32b6), 1);
+  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
+               _mm256_castsi256_si128(srcReg32b7), 1);
+
+  // merge every two consecutive registers except the last one
+  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+  // save
+  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+
+  // save
+  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+
+  // save
+  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+  // save
+  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+
+  for (i = output_height; i > 1; i-=2) {
+     // load the last 2 loads of 16 bytes and have every two
+     // consecutive loads in the same 256 bit register
+     srcReg32b8 = _mm256_castsi128_si256(
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+     _mm256_castsi256_si128(srcReg32b8), 1);
+     srcReg32b9 = _mm256_castsi128_si256(
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+     _mm256_castsi256_si128(srcReg32b9), 1);
+
+     // merge every two consecutive registers
+     // save
+     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+     // add and saturate the results together
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+     // add and saturate the results together
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+     // add and saturate the results together
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+     // shift by 7 bit each 16 bit
+     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+
+     // shrink to 8 bit each 16 bits, the first lane contain the first
+     // convolve result and the second lane contain the second convolve
+     // result
+     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+     src_ptr+=src_stride;
+
+     // save 16 bytes
+     _mm_store_si128((__m128i*)output_ptr,
+     _mm256_castsi256_si128(srcReg32b1));
+
+     // save the next 16 bits
+     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
+     _mm256_extractf128_si256(srcReg32b1, 1));
+
+     output_ptr+=dst_stride;
+
+     // save part of the registers for next strides
+     srcReg32b10 = srcReg32b11;
+     srcReg32b1 = srcReg32b3;
+     srcReg32b11 = srcReg32b2;
+     srcReg32b3 = srcReg32b5;
+     srcReg32b2 = srcReg32b4;
+     srcReg32b5 = srcReg32b7;
+     srcReg32b7 = srcReg32b9;
+  }
+  if (i > 0) {
+    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+    // load the last 16 bytes
+    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the last 2 results together
+    srcRegFilt4 = _mm_unpacklo_epi8(
+                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+    srcRegFilt7 = _mm_unpackhi_epi8(
+                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+                  _mm256_castsi256_si128(firstFilters));
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
+                  _mm256_castsi256_si128(forthFilters));
+    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+                  _mm256_castsi256_si128(firstFilters));
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
+                  _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+                  _mm256_castsi256_si128(secondFilters));
+    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+                  _mm256_castsi256_si128(secondFilters));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+                  _mm256_castsi256_si128(thirdFilters));
+    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+
+
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm256_castsi256_si128(addFilterReg64));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(addFilterReg64));
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+  }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif  // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
+#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
+#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
+#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
+#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
+#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3
+#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
+// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+FUN_CONV_2D(, avx2);
+#endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 000000000..5fd2857e1
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,601 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// tmmintrin.h.
+#include "./vp9_rtcd.h"
+
+#include <tmmintrin.h>
+
+#include "vp9/common/x86/convolve.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// filters only for the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// filters for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+
+void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the first lane
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bit in the filter into the first lane
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the seconds 16 bits in the filter into the second lane
+  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the forth 16 bits in the filter into the second lane
+  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // loading the local filters
+  shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
+  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // extract the higher half of the lane
+    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // add and saturate all the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr+=src_pixels_per_line;
+
+    // save only 4 bytes
+    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // add and saturate all the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save only 8 bytes
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                                 ptrdiff_t src_pixels_per_line,
+                                                 uint8_t *output_ptr,
+                                                 ptrdiff_t output_pitch,
+                                                 uint32_t output_height,
+                                                 const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // reading the next 16 bytes.
+    // (part of it was being read by earlier read)
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // filter the source buffer
+    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load the last 8 bytes
+    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the result together
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+    // merge the result together
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // add and saturate the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save only 8 bytes convolve result
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+                                                 ptrdiff_t src_pitch,
+                                                 uint8_t *output_ptr,
+                                                 ptrdiff_t out_pitch,
+                                                 uint32_t output_height,
+                                                 const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 16 bytes
+  srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load the last 16 bytes
+    srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the result together
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+    // merge the result together
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+    // merge the result together
+    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save 16 bytes convolve result
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif  // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
+
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h);
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+            ssse3);
+
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);
diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..9dc8d0abb
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
@@ -0,0 +1,987 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro GET_FILTERS_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    psrldq      xmm7, 8
+    pshuflw     xmm4, xmm7, 0b              ;k4
+    pshuflw     xmm5, xmm7, 01010101b       ;k5
+    pshuflw     xmm6, xmm7, 10101010b       ;k6
+    pshuflw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklqdq  xmm0, xmm1
+    punpcklqdq  xmm2, xmm3
+    punpcklqdq  xmm5, xmm4
+    punpcklqdq  xmm6, xmm7
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm2
+    movdqa      k5k4, xmm5
+    movdqa      k6k7, xmm6
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6
+
+    pxor        xmm7, xmm7
+    movdqa      zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+    punpckldq   xmm0, xmm1                  ;two row in one register
+    punpckldq   xmm6, xmm7
+    punpckldq   xmm2, xmm3
+    punpckldq   xmm5, xmm4
+
+    punpcklbw   xmm0, zero                  ;unpack to word
+    punpcklbw   xmm6, zero
+    punpcklbw   xmm2, zero
+    punpcklbw   xmm5, zero
+
+    pmullw      xmm0, k0k1                  ;multiply the filter factors
+    pmullw      xmm6, k6k7
+    pmullw      xmm2, k2k3
+    pmullw      xmm5, k5k4
+
+    paddsw      xmm0, xmm6                  ;sum
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 8
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm2
+    psrldq      xmm2, 8
+    paddsw      xmm0, xmm5
+    psrldq      xmm5, 8
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, xmm5
+
+    paddsw      xmm0, krd                   ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movd        [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    pshufhw     xmm4, xmm7, 0b              ;k4
+    pshufhw     xmm5, xmm7, 01010101b       ;k5
+    pshufhw     xmm6, xmm7, 10101010b       ;k6
+    pshufhw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklwd   xmm0, xmm0
+    punpcklwd   xmm1, xmm1
+    punpcklwd   xmm2, xmm2
+    punpcklwd   xmm3, xmm3
+    punpckhwd   xmm4, xmm4
+    punpckhwd   xmm5, xmm5
+    punpckhwd   xmm6, xmm6
+    punpckhwd   xmm7, xmm7
+
+    movdqa      k0,   xmm0                  ;store filter factors on stack
+    movdqa      k1,   xmm1
+    movdqa      k2,   xmm2
+    movdqa      k3,   xmm3
+    movdqa      k4,   xmm4
+    movdqa      k5,   xmm5
+    movdqa      k6,   xmm6
+    movdqa      k7,   xmm7
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6                   ;rounding
+
+    pxor        xmm7, xmm7
+    movdqa      zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+    movq        xmm0, [rsi + %1]            ;0
+    movq        xmm1, [rsi + rax + %1]      ;1
+    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea         rsi,  [rsi + rax]
+    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
+    movq        xmm2, [rsi + rax + %1]      ;2
+    movq        xmm3, [rsi + rax * 2 + %1]  ;3
+    movq        xmm4, [rsi + rdx + %1]      ;4
+    movq        xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+    punpcklbw   xmm0, zero
+    punpcklbw   xmm1, zero
+    punpcklbw   xmm6, zero
+    punpcklbw   xmm7, zero
+    punpcklbw   xmm2, zero
+    punpcklbw   xmm5, zero
+    punpcklbw   xmm3, zero
+    punpcklbw   xmm4, zero
+
+    pmullw      xmm0, k0
+    pmullw      xmm1, k1
+    pmullw      xmm6, k6
+    pmullw      xmm7, k7
+    pmullw      xmm2, k2
+    pmullw      xmm5, k5
+    pmullw      xmm3, k3
+    pmullw      xmm4, k4
+
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm6
+    paddsw      xmm0, xmm7
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, xmm5
+    paddsw      xmm0, xmm3
+    paddsw      xmm0, xmm4
+
+    paddsw      xmm0, krd                   ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+%if %1
+    movq        xmm1, [rdi + %2]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi + %2], xmm0
+%endm
+
+;void vp9_filter_block1d4_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movd        xmm0, [rsi]                 ;load src: row 0
+    movd        xmm1, [rsi + rax]           ;1
+    movd        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movd        xmm7, [rsi + rdx * 2]       ;7
+    movd        xmm2, [rsi + rax]           ;2
+    movd        xmm3, [rsi + rax * 2]       ;3
+    movd        xmm4, [rsi + rdx]           ;4
+    movd        xmm5, [rsi + rax * 4]       ;5
+
+    APPLY_FILTER_4 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+    sub         rsi, rax
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 0, 8
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movd        xmm0, [rsi]                 ;load src: row 0
+    movd        xmm1, [rsi + rax]           ;1
+    movd        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movd        xmm7, [rsi + rdx * 2]       ;7
+    movd        xmm2, [rsi + rax]           ;2
+    movd        xmm3, [rsi + rax * 2]       ;3
+    movd        xmm4, [rsi + rdx]           ;4
+    movd        xmm5, [rsi + rax * 4]       ;5
+
+    APPLY_FILTER_4 1
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 1, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 1, 0
+    sub         rsi, rax
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 1, 8
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d4_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm3, 3
+    psrldq      xmm5, 5
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_4 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    movdqu      xmm0,   [rsi + 5]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 8
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm3, 3
+    psrldq      xmm5, 5
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_4 1
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 0
+
+    movdqu      xmm0,   [rsi + 5]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 8
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
new file mode 100644
index 000000000..4a5bf1b60
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -0,0 +1,1071 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro VERTx4 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                    ;pitch * 6
+
+.loop:
+    movd        xmm0, [rsi]                 ;A
+    movd        xmm1, [rsi + rdx]           ;B
+    movd        xmm2, [rsi + rdx * 2]       ;C
+    movd        xmm3, [rax + rdx * 2]       ;D
+    movd        xmm4, [rsi + rdx * 4]       ;E
+    movd        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movd        xmm6, [rsi + rbx]           ;G
+    movd        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    movdqa      xmm1, xmm2
+    paddsw      xmm0, xmm6
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
+    paddsw      xmm0, xmm4
+    paddsw      xmm0, xmm2
+
+    paddsw      xmm0, krd
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi,  rdx
+    add         rax,  rdx
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movd        [rdi], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .loop
+%endm
+
+%macro VERTx8 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                    ;pitch * 6
+
+.loop:
+    movq        xmm0, [rsi]                 ;A
+    movq        xmm1, [rsi + rdx]           ;B
+    movq        xmm2, [rsi + rdx * 2]       ;C
+    movq        xmm3, [rax + rdx * 2]       ;D
+    movq        xmm4, [rsi + rdx * 4]       ;E
+    movq        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movq        xmm6, [rsi + rbx]           ;G
+    movq        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm6
+    movdqa      xmm1, xmm2
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
+    paddsw      xmm0, xmm4
+    paddsw      xmm0, xmm2
+
+    paddsw      xmm0, krd
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi,  rdx
+    add         rax,  rdx
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .loop
+%endm
+
+
+%macro VERTx16 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                    ;pitch * 6
+
+.loop:
+    movq        xmm0, [rsi]                 ;A
+    movq        xmm1, [rsi + rdx]           ;B
+    movq        xmm2, [rsi + rdx * 2]       ;C
+    movq        xmm3, [rax + rdx * 2]       ;D
+    movq        xmm4, [rsi + rdx * 4]       ;E
+    movq        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movq        xmm6, [rsi + rbx]           ;G
+    movq        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm6
+    movdqa      xmm1, xmm2
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
+    paddsw      xmm0, xmm4
+    paddsw      xmm0, xmm2
+
+    paddsw      xmm0, krd
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0
+
+    movq        xmm0, [rsi + 8]             ;A
+    movq        xmm1, [rsi + rdx + 8]       ;B
+    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
+    movq        xmm3, [rax + rdx * 2 + 8]   ;D
+    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
+    movq        xmm5, [rax + rdx * 4 + 8]   ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movq        xmm6, [rsi + rbx + 8]       ;G
+    movq        xmm7, [rax + rbx + 8]       ;H
+    punpcklbw   xmm6, xmm7                  ;G H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm6
+    movdqa      xmm1, xmm2
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
+    paddsw      xmm0, xmm4
+    paddsw      xmm0, xmm2
+
+    paddsw      xmm0, krd
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi,  rdx
+    add         rax,  rdx
+%if %1
+    movq    xmm1, [rdi+8]
+    pavgb   xmm0, xmm1
+%endif
+
+    movq        [rdi+8], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .loop
+%endm
+
+;void vp9_filter_block1d8_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx4 0
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx8 0
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx16 0
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx4 1
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx8 1
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx16 1
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%macro HORIZx4_ROW 2
+    movdqa      %2,   %1
+    pshufb      %1,   [GLOBAL(shuf_t0t1)]
+    pshufb      %2,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   %1,   k0k1k4k5
+    pmaddubsw   %2,   k2k3k6k7
+
+    movdqa      xmm4, %1
+    movdqa      xmm5, %2
+    psrldq      %1,   8
+    psrldq      %2,   8
+    movdqa      xmm6, xmm5
+
+    paddsw      xmm4, %2
+    pmaxsw      xmm5, %1
+    pminsw      %1, xmm6
+    paddsw      %1, xmm4
+    paddsw      %1, xmm5
+
+    paddsw      %1,   krd
+    psraw       %1,   7
+    packuswb    %1,   %1
+%endm
+
+%macro HORIZx4 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm6, xmm4, 0b              ;k0_k1
+    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
+    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
+    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
+    pshufd      xmm5, xmm5, 0               ;rounding
+
+    movdqa      k0k1k4k5, xmm6
+    movdqa      k2k3k6k7, xmm7
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+    shr         rcx, 1
+.loop:
+    ;Do two rows once
+    movq        xmm0,   [rsi - 3]           ;load src
+    movq        xmm1,   [rsi + 5]
+    movq        xmm2,   [rsi + rax - 3]
+    movq        xmm3,   [rsi + rax + 5]
+    punpcklqdq  xmm0,   xmm1
+    punpcklqdq  xmm2,   xmm3
+
+    HORIZx4_ROW xmm0,   xmm1
+    HORIZx4_ROW xmm2,   xmm3
+%if %1
+    movd        xmm1,   [rdi]
+    pavgb       xmm0,   xmm1
+    movd        xmm3,   [rdi + rdx]
+    pavgb       xmm2,   xmm3
+%endif
+    movd        [rdi],  xmm0
+    movd        [rdi +rdx],  xmm2
+
+    lea         rsi,    [rsi + rax]
+    prefetcht0  [rsi + 4 * rax - 3]
+    lea         rsi,    [rsi + rax]
+    lea         rdi,    [rdi + 2 * rdx]
+    prefetcht0  [rsi + 2 * rax - 3]
+
+    dec         rcx
+    jnz         .loop
+
+    ; Do last row if output_height is odd
+    movsxd      rcx,    dword ptr arg(4)       ;output_height
+    and         rcx,    1
+    je          .done
+
+    movq        xmm0,   [rsi - 3]    ; load src
+    movq        xmm1,   [rsi + 5]
+    punpcklqdq  xmm0,   xmm1
+
+    HORIZx4_ROW xmm0, xmm1
+%if %1
+    movd        xmm1,   [rdi]
+    pavgb       xmm0,   xmm1
+%endif
+    movd        [rdi],  xmm0
+.done
+%endm
+
+%macro HORIZx8_ROW 4
+    movdqa      %2,   %1
+    movdqa      %3,   %1
+    movdqa      %4,   %1
+
+    pshufb      %1,   [GLOBAL(shuf_t0t1)]
+    pshufb      %2,   [GLOBAL(shuf_t2t3)]
+    pshufb      %3,   [GLOBAL(shuf_t4t5)]
+    pshufb      %4,   [GLOBAL(shuf_t6t7)]
+
+    pmaddubsw   %1,   k0k1
+    pmaddubsw   %2,   k2k3
+    pmaddubsw   %3,   k4k5
+    pmaddubsw   %4,   k6k7
+
+    paddsw      %1,   %4
+    movdqa      %4,   %2
+    pmaxsw      %2,   %3
+    pminsw      %3,   %4
+    paddsw      %1,   %3
+    paddsw      %1,   %2
+
+    paddsw      %1,   krd
+    psraw       %1,   7
+    packuswb    %1,   %1
+%endm
+
+%macro HORIZx8 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+    shr         rcx, 1
+
+.loop:
+    movq        xmm0,   [rsi - 3]           ;load src
+    movq        xmm3,   [rsi + 5]
+    movq        xmm4,   [rsi + rax - 3]
+    movq        xmm7,   [rsi + rax + 5]
+    punpcklqdq  xmm0,   xmm3
+    punpcklqdq  xmm4,   xmm7
+
+    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
+    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
+%if %1
+    movq        xmm1,   [rdi]
+    movq        xmm2,   [rdi + rdx]
+    pavgb       xmm0,   xmm1
+    pavgb       xmm4,   xmm2
+%endif
+    movq        [rdi],  xmm0
+    movq        [rdi + rdx],  xmm4
+
+    lea         rsi,    [rsi + rax]
+    prefetcht0  [rsi + 4 * rax - 3]
+    lea         rsi,    [rsi + rax]
+    lea         rdi,    [rdi + 2 * rdx]
+    prefetcht0  [rsi + 2 * rax - 3]
+    dec         rcx
+    jnz         .loop
+
+    ;Do last row if output_height is odd
+    movsxd      rcx,    dword ptr arg(4)    ;output_height
+    and         rcx,    1
+    je          .done
+
+    movq        xmm0,   [rsi - 3]
+    movq        xmm3,   [rsi + 5]
+    punpcklqdq  xmm0,   xmm3
+
+    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
+%if %1
+    movq        xmm1,   [rdi]
+    pavgb       xmm0,   xmm1
+%endif
+    movq        [rdi],  xmm0
+.done
+%endm
+
+%macro HORIZx16 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+
+.loop:
+    prefetcht0  [rsi + 2 * rax -3]
+
+    movq        xmm0,   [rsi - 3]           ;load src data
+    movq        xmm4,   [rsi + 5]
+    movq        xmm6,   [rsi + 13]
+    punpcklqdq  xmm0,   xmm4
+    punpcklqdq  xmm4,   xmm6
+
+    movdqa      xmm7,   xmm0
+
+    punpcklbw   xmm7,   xmm7
+    punpckhbw   xmm0,   xmm0
+    movdqa      xmm1,   xmm0
+    movdqa      xmm2,   xmm0
+    movdqa      xmm3,   xmm0
+
+    palignr     xmm0,   xmm7, 1
+    palignr     xmm1,   xmm7, 5
+    pmaddubsw   xmm0,   k0k1
+    palignr     xmm2,   xmm7, 9
+    pmaddubsw   xmm1,   k2k3
+    palignr     xmm3,   xmm7, 13
+
+    pmaddubsw   xmm2,   k4k5
+    pmaddubsw   xmm3,   k6k7
+    paddsw      xmm0,   xmm3
+
+    movdqa      xmm3,   xmm4
+    punpcklbw   xmm3,   xmm3
+    punpckhbw   xmm4,   xmm4
+
+    movdqa      xmm5,   xmm4
+    movdqa      xmm6,   xmm4
+    movdqa      xmm7,   xmm4
+
+    palignr     xmm4,   xmm3, 1
+    palignr     xmm5,   xmm3, 5
+    palignr     xmm6,   xmm3, 9
+    palignr     xmm7,   xmm3, 13
+
+    movdqa      xmm3,   xmm1
+    pmaddubsw   xmm4,   k0k1
+    pmaxsw      xmm1,   xmm2
+    pmaddubsw   xmm5,   k2k3
+    pminsw      xmm2,   xmm3
+    pmaddubsw   xmm6,   k4k5
+    paddsw      xmm0,   xmm2
+    pmaddubsw   xmm7,   k6k7
+    paddsw      xmm0,   xmm1
+
+    paddsw      xmm4,   xmm7
+    movdqa      xmm7,   xmm5
+    pmaxsw      xmm5,   xmm6
+    pminsw      xmm6,   xmm7
+    paddsw      xmm4,   xmm6
+    paddsw      xmm4,   xmm5
+
+    paddsw      xmm0,   krd
+    paddsw      xmm4,   krd
+    psraw       xmm0,   7
+    psraw       xmm4,   7
+    packuswb    xmm0,   xmm0
+    packuswb    xmm4,   xmm4
+    punpcklqdq  xmm0,   xmm4
+%if %1
+    movdqa      xmm1,   [rdi]
+    pavgb       xmm0,   xmm1
+%endif
+
+    lea         rsi,    [rsi + rax]
+    movdqa      [rdi],  xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+%endm
+
+;void vp9_filter_block1d4_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 3
+    %define k0k1k4k5 [rsp + 16 * 0]
+    %define k2k3k6k7 [rsp + 16 * 1]
+    %define krd      [rsp + 16 * 2]
+
+    HORIZx4 0
+
+    add rsp, 16 * 3
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx8 0
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx16 0
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 3
+    %define k0k1k4k5 [rsp + 16 * 0]
+    %define k2k3k6k7 [rsp + 16 * 1]
+    %define krd      [rsp + 16 * 2]
+
+    HORIZx4 1
+
+    add rsp, 16 * 3
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx8 1
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx16 1
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+SECTION_RODATA
+align 16
+shuf_t0t1:
+    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+align 16
+shuf_t2t3:
+    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+align 16
+shuf_t4t5:
+    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+align 16
+shuf_t6t7:
+    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
new file mode 100644
index 000000000..d94ccf2e9
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
@@ -0,0 +1,448 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm3, [rdx]                 ;load filters
+    pshuflw     xmm4, xmm3, 11111111b       ;k3
+    psrldq      xmm3, 8
+    pshuflw     xmm3, xmm3, 0b              ;k4
+    punpcklqdq  xmm4, xmm3                  ;k3k4
+
+    movq        xmm3, rcx                   ;rounding
+    pshufd      xmm3, xmm3, 0
+
+    pxor        xmm2, xmm2
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+
+    punpckldq   xmm0, xmm1                  ;two row in one register
+    punpcklbw   xmm0, xmm2                  ;unpack to word
+    pmullw      xmm0, xmm4                  ;multiply the filter factors
+
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 8
+    paddsw      xmm0, xmm1
+
+    paddsw      xmm0, xmm3                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+
+    movd        [rdi], xmm0
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+%macro GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+
+    pshuflw     xmm6, xmm7, 11111111b       ;k3
+    pshufhw     xmm7, xmm7, 0b              ;k4
+    punpcklwd   xmm6, xmm6
+    punpckhwd   xmm7, xmm7
+
+    movq        xmm4, rcx                   ;rounding
+    pshufd      xmm4, xmm4, 0
+
+    pxor        xmm5, xmm5
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+    punpcklbw   xmm0, xmm5
+    punpcklbw   xmm1, xmm5
+
+    pmullw      xmm0, xmm6
+    pmullw      xmm1, xmm7
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm4                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+    punpcklbw   xmm0, xmm5
+    punpcklbw   xmm1, xmm5
+    punpckhbw   xmm2, xmm5
+    punpckhbw   xmm3, xmm5
+
+    pmullw      xmm0, xmm6
+    pmullw      xmm1, xmm7
+    pmullw      xmm2, xmm6
+    pmullw      xmm3, xmm7
+
+    paddsw      xmm0, xmm1
+    paddsw      xmm2, xmm3
+
+    paddsw      xmm0, xmm4                  ;rounding
+    paddsw      xmm2, xmm4
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
+sym(vp9_filter_block1d4_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
+sym(vp9_filter_block1d8_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
+sym(vp9_filter_block1d16_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + rax]         ;1
+    movdqa        xmm2, xmm0
+    movdqa        xmm3, xmm1
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + rax]         ;1
+    movdqa        xmm2, xmm0
+    movdqa        xmm3, xmm1
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
+sym(vp9_filter_block1d4_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
+sym(vp9_filter_block1d8_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
+sym(vp9_filter_block1d16_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
new file mode 100644
index 000000000..b5e18fe6d
--- /dev/null
+++ b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,422 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm3, [rdx]                 ;load filters
+    psrldq      xmm3, 6
+    packsswb    xmm3, xmm3
+    pshuflw     xmm3, xmm3, 0b              ;k3_k4
+
+    movq        xmm2, rcx                   ;rounding
+    pshufd      xmm2, xmm2, 0
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+    punpcklbw   xmm0, xmm1
+    pmaddubsw   xmm0, xmm3
+
+    paddsw      xmm0, xmm2                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movd        [rdi], xmm0
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+%macro GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    psrldq      xmm7, 6
+    packsswb    xmm7, xmm7
+    pshuflw     xmm7, xmm7, 0b              ;k3_k4
+    punpcklwd   xmm7, xmm7
+
+    movq        xmm6, rcx                   ;rounding
+    pshufd      xmm6, xmm6, 0
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+    punpcklbw   xmm0, xmm1
+    pmaddubsw   xmm0, xmm7
+
+    paddsw      xmm0, xmm6                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+    punpcklbw   xmm0, xmm1
+    punpckhbw   xmm2, xmm1
+    pmaddubsw   xmm0, xmm7
+    pmaddubsw   xmm2, xmm7
+
+    paddsw      xmm0, xmm6                  ;rounding
+    paddsw      xmm2, xmm6
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+%endm
+
+global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + rax]         ;1
+    movdqa        xmm2, xmm0
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;0
+    movq        xmm1, [rsi + rax]           ;1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + rax]         ;1
+    movdqa        xmm2, xmm0
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret