Diffstat (limited to 'third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c')
-rw-r--r-- | third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 334 |
1 file changed, 334 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 000000000..5166e9e0a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "aom_ports/mem.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
+  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
+  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
+  temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
+  temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
+  temp1 = _mm_unpacklo_epi32(temp1, temp2);
+  return _mm_unpacklo_epi64(temp3, temp1);
+}
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height);
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height);
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height);
+
+#define MASKSADMXN_SSSE3(m, n)                                                \
+  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,            \
+                            msk_stride, m, n);                                \
+  }
+
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+
+#define MASKSAD8XN_SSSE3(n)                                                   \
+  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
+                               msk_stride, n);                                \
+  }
+
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+
+#define MASKSAD4XN_SSSE3(n)                                                   \
+  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
+                               msk_stride, n);                                \
+  }
+
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
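The macros above only stamp out one entry point per block size; the real work happens in the three kernels that follow. As a point of reference for what each generated function computes, here is a minimal scalar sketch; the name masked_sad_ref and the test-helper framing are ours, not part of this patch. It assumes, as the kernels do, that mask values lie in [0, 64], and applies the same rounded division by 64 as the final (sum + 31) >> 6 step in the SIMD code.

#include <stdint.h>
#include <stdlib.h>

unsigned int masked_sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            const uint8_t *msk, int msk_stride, int width,
                            int height) {
  unsigned int sum = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++)
      sum += msk[x] * (unsigned int)abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
    msk += msk_stride;
  }
  return (sum + 31) >> 6;  // round to nearest while dividing by 64
}

Because the mask maxes out at 64, the result is a weighted SAD normalized back to the ordinary SAD scale: a mask of all 64s reproduces a plain SAD.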
+
+// For width a multiple of 16
+// Assumes values in m are <= 64
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 16) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+      m = _mm_loadu_si128((const __m128i *)(m_ptr + x));
+
+      // Calculate the absolute difference between a & b
+      temp1 = _mm_subs_epu8(a, b);
+      temp2 = _mm_subs_epu8(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m and add adjacent pairs together
+      temp2 = _mm_maddubs_epi16(temp1, m);
+      // Widen the row result to 32-bit integers & add to running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2, row_res;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = width8_load_2rows(a_ptr, a_stride);
+    b = width8_load_2rows(b_ptr, b_stride);
+    m = width8_load_2rows(m_ptr, m_stride);
+
+    // Calculate the absolute difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add adjacent pairs together
+    row_res = _mm_maddubs_epi16(temp1, m);
+
+    // Widen the row result to 32-bit integers & add to running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2, row_res;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 4 rows at a time
+  for (y = 0; y < height; y += 4) {
+    // Load a, b, m in xmm registers
+    a = width4_load_4rows(a_ptr, a_stride);
+    b = width4_load_4rows(b_ptr, b_stride);
+    m = width4_load_4rows(m_ptr, m_stride);
+
+    // Calculate the absolute difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add adjacent pairs together
+    row_res = _mm_maddubs_epi16(temp1, m);
+
+    // Widen the row result to 32-bit integers & add to running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 4;
+    b_ptr += b_stride * 4;
+    m_ptr += m_stride * 4;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
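Two SSE idioms carry these low-bit-depth kernels. First, the absolute difference of unsigned bytes is formed as (a -sat b) | (b -sat a): whichever saturating subtraction goes the "wrong" way clamps to zero, so the OR keeps only |a - b|. Second, _mm_maddubs_epi16 multiplies unsigned bytes by signed bytes and sums adjacent pairs into 16-bit lanes; with |a - b| <= 255 and m <= 64 each pair sums to at most 2 * 255 * 64 = 32640 < 32767, so the saturating add inside maddubs never actually saturates. A small self-contained check of the absolute-difference identity (the buffer contents and main are illustrative; compile with SSE2 enabled):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  uint8_t a[16], b[16], out[16];
  for (int i = 0; i < 16; i++) {
    a[i] = (uint8_t)(rand() & 0xff);
    b[i] = (uint8_t)(rand() & 0xff);
  }
  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  // Each saturating subtract zeroes the "wrong" direction, so OR-ing the
  // two results leaves the per-byte absolute difference.
  __m128i d = _mm_or_si128(_mm_subs_epu8(va, vb), _mm_subs_epu8(vb, va));
  _mm_storeu_si128((__m128i *)out, d);
  for (int i = 0; i < 16; i++)
    if (out[i] != (a[i] > b[i] ? a[i] - b[i] : b[i] - a[i])) return 1;
  printf("absolute-difference identity holds\n");
  return 0;
}

The follow-on _mm_madd_epi16 with a vector of ones simply widens those 16-bit partial sums to 32 bits while adding another pair, which is why the accumulator can run the whole block without overflow.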
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
+                                               int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE unsigned int highbd_masked_sad_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int width, int height);
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
+  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
+                                   msk_stride, m, n);                         \
+  }
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
+  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *msk, int msk_stride) {                   \
+    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
+                                      msk_stride, n);                         \
+  }
+
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 8
+// Assumes values in m are <= 64
+static INLINE unsigned int highbd_masked_sad_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 8) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
+                            _mm_setzero_si128());
+
+      // Calculate the absolute difference between a & b
+      temp1 = _mm_subs_epu16(a, b);
+      temp2 = _mm_subs_epu16(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m, add adjacent pairs, and accumulate the running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
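The high-bit-depth kernel follows the same shape on 16-bit pixels: the 8-bit mask is zero-extended to 16 bits, and _mm_madd_epi16 does the multiply and pairwise widening in one step. This is safe because a pixel difference of at most 4095 (12-bit input) times a mask of at most 64 stays well inside the 32-bit intermediate products. A hedged scalar model of this path, with highbd_masked_sad_ref as a hypothetical helper name, not part of the patch; the real code receives uint8_t pointers and recovers the uint16_t buffers through CONVERT_TO_SHORTPTR, which this sketch takes as already done:

#include <stdint.h>

// Hypothetical scalar reference for the high-bit-depth path. Pixels are
// 16-bit; the mask is still 8-bit with values assumed <= 64.
unsigned int highbd_masked_sad_ref(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   const uint8_t *msk, int msk_stride,
                                   int width, int height) {
  unsigned int sum = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      int diff = src[x] - ref[x];  // fits in int for up to 16-bit pixels
      sum += (unsigned int)(msk[x] * (diff < 0 ? -diff : diff));
    }
    src += src_stride;
    ref += ref_stride;
    msk += msk_stride;
  }
  return (sum + 31) >> 6;  // same rounded divide-by-64 as the SIMD code
}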
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+    const uint8_t *m_ptr, int m_stride, int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = highbd_width4_load_2rows(a_ptr, a_stride);
+    b = highbd_width4_load_2rows(b_ptr, b_stride);
+    temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
+    temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
+    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
+                          _mm_setzero_si128());
+
+    // Calculate the absolute difference between a & b
+    temp1 = _mm_subs_epu16(a, b);
+    temp2 = _mm_subs_epu16(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m, add adjacent pairs, and accumulate the running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+#endif  // CONFIG_HIGHBITDEPTH
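Finally, a sketch of how one of the generated entry points might be exercised against the scalar reference from the first sketch; check_16x16, the buffer layout, and the random fill are illustrative, while aom_masked_sad16x16_ssse3 itself is generated by MASKSADMXN_SSSE3(16, 16) above:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

// Generated by MASKSADMXN_SSSE3(16, 16) in the file above.
unsigned int aom_masked_sad16x16_ssse3(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       const uint8_t *msk, int msk_stride);

// Scalar reference from the earlier sketch.
unsigned int masked_sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            const uint8_t *msk, int msk_stride, int width,
                            int height);

static void check_16x16(void) {
  uint8_t src[16 * 16], ref[16 * 16], msk[16 * 16];
  for (int i = 0; i < 16 * 16; i++) {
    src[i] = (uint8_t)(rand() & 0xff);
    ref[i] = (uint8_t)(rand() & 0xff);
    msk[i] = (uint8_t)(rand() % 65);  // the kernels assume mask values <= 64
  }
  assert(aom_masked_sad16x16_ssse3(src, 16, ref, 16, msk, 16) ==
         masked_sad_ref(src, 16, ref, 16, msk, 16, 16, 16));
}

int main(void) {
  check_16x16();
  return 0;
}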