1 files changed, 171 insertions, 47 deletions
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
index 79677c92f..a4c3616e7 100644
--- a/third_party/aom/aom_dsp/variance.c
+++ b/third_party/aom/aom_dsp/variance.c
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <stdlib.h>
+#include <string.h>
+#include <assert.h>
 
 #include "./aom_config.h"
 #include "./aom_dsp_rtcd.h"
@@ -20,6 +22,9 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/blend.h"
 
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
+
 uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
   int distortion = 0;
@@ -246,6 +251,13 @@ VARIANCES(4, 2)
 VARIANCES(2, 4)
 VARIANCES(2, 2)
 
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
+VARIANCES(4, 16)
+VARIANCES(16, 4)
+VARIANCES(8, 32)
+VARIANCES(32, 8)
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
+
 GET_VAR(16, 16)
 GET_VAR(8, 8)
 
@@ -271,33 +283,66 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
 
 // Get pred block from up-sampled reference.
 void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
-                          const uint8_t *ref, int ref_stride) {
-  int i, j, k;
-  int stride = ref_stride << 3;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0, k = 0; j < width; j++, k += 8) {
-      comp_pred[j] = ref[k];
+                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                          int ref_stride) {
+  if (!subpel_x_q3 && !subpel_y_q3) {  // no sub-pel offset: plain row copy
+    int i;
+    for (i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;  // output rows are packed: stride == width
+      ref += ref_stride;
+    }
+  } else {
+    InterpFilterParams filter;
+    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+    if (!subpel_y_q3) {  // horizontal offset only
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      /* Call the C version directly so that small (2x2) sizes work too. */
+      aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+                            -1, width, height);
+    } else if (!subpel_x_q3) {  // vertical offset only
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      /* Call the C version directly so that small (2x2) sizes work too. */
+      aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+                           16, width, height);
+    } else {  // both offsets: horizontal pass into temp, then vertical pass
+      DECLARE_ALIGNED(16, uint8_t,
+                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+      const int16_t *kernel_x;
+      const int16_t *kernel_y;
+      int intermediate_height;  // rows written to temp by the horizontal pass
+      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      intermediate_height =
+          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);  // temp rows
+      /* Call the C versions directly so that small (2x2) sizes work too. */
+      aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1),
+                            ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL,
+                            -1, width, intermediate_height);  // ref -> temp
+      aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                           MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y,
+                           16, width, height);  // temp -> comp_pred
     }
-    comp_pred += width;
-    ref += stride;
   }
 }
 
 void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
-                                   int width, int height, const uint8_t *ref,
+                                   int width, int height, int subpel_x_q3,
+                                   int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride) {
   int i, j;
-  int stride = ref_stride << 3;
 
+  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
+                     ref_stride);  // comp_pred now holds the ref prediction
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
-      const int tmp = ref[(j << 3)] + pred[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);  // in place
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
   }
 }
 
@@ -611,6 +656,13 @@ HIGHBD_VARIANCES(4, 2)
 HIGHBD_VARIANCES(2, 4)
 HIGHBD_VARIANCES(2, 2)
 
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
+HIGHBD_VARIANCES(4, 16)
+HIGHBD_VARIANCES(16, 4)
+HIGHBD_VARIANCES(8, 32)
+HIGHBD_VARIANCES(32, 8)
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
+
 HIGHBD_GET_VAR(8)
 HIGHBD_GET_VAR(16)
 
@@ -637,37 +689,74 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
 }
 
 void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
-                                 const uint8_t *ref8, int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = ref[(j << 3)];
+                                 int subpel_x_q3, int subpel_y_q3,
+                                 const uint8_t *ref8, int ref_stride, int bd) {
+  if (!subpel_x_q3 && !subpel_y_q3) {  // no sub-pel offset: plain row copy
+    const uint16_t *ref;
+    int i;
+    ref = CONVERT_TO_SHORTPTR(ref8);
+    for (i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;  // output rows are packed: stride == width
+      ref += ref_stride;
+    }
+  } else {
+    InterpFilterParams filter;
+    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+    if (!subpel_y_q3) {  // horizontal offset only
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      /* Call the C version directly so that small (2x2) sizes work too. */
+      aom_highbd_convolve8_horiz_c(ref8, ref_stride,
+                                   CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
+                                   16, NULL, -1, width, height, bd);
+    } else if (!subpel_x_q3) {  // vertical offset only
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      /* Call the C version directly so that small (2x2) sizes work too. */
+      aom_highbd_convolve8_vert_c(ref8, ref_stride,
+                                  CONVERT_TO_BYTEPTR(comp_pred), width, NULL,
+                                  -1, kernel, 16, width, height, bd);
+    } else {  // both offsets: horizontal pass into temp, then vertical pass
+      DECLARE_ALIGNED(16, uint16_t,
+                      temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+      const int16_t *kernel_x;
+      const int16_t *kernel_y;
+      int intermediate_height;  // rows written to temp by the horizontal pass
+      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      intermediate_height =
+          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+      assert(intermediate_height <= (MAX_SB_SIZE + 16) + 16);  // rows in temp
+      /* Call the C versions directly so that small (2x2) sizes work too. */
+      aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+                                   ref_stride, CONVERT_TO_BYTEPTR(temp),
+                                   MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                                   intermediate_height, bd);  // ref -> temp
+      aom_highbd_convolve8_vert_c(
+          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+          16, width, height, bd);  // temp -> comp_pred
     }
-    comp_pred += width;
-    ref += stride;
   }
 }
 
 void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
                                           const uint8_t *pred8, int width,
-                                          int height, const uint8_t *ref8,
-                                          int ref_stride) {
+                                          int height, int subpel_x_q3,
+                                          int subpel_y_q3, const uint8_t *ref8,
+                                          int ref_stride, int bd) {
   int i, j;
-  int stride = ref_stride << 3;
 
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
+                            ref8, ref_stride, bd);  // ref prediction first
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
-      const int tmp = pred[j] + ref[(j << 3)];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);  // in place
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
   }
 }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -694,22 +783,23 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
 }
 
 void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, const uint8_t *ref,
+                                    int width, int height, int subpel_x_q3,
+                                    int subpel_y_q3, const uint8_t *ref,
                                     int ref_stride, const uint8_t *mask,
                                     int mask_stride, int invert_mask) {
   int i, j;
-  int stride = ref_stride << 3;
 
+  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
+                     ref_stride);  // comp_pred now holds the ref prediction
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[(j << 3)], pred[j]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
       else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[(j << 3)]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
     mask += mask_stride;
   }
 }
@@ -753,6 +843,13 @@ MASK_SUBPIX_VAR(128, 64)
 MASK_SUBPIX_VAR(128, 128)
 #endif  // CONFIG_EXT_PARTITION
 
+#if CONFIG_EXT_PARTITION_TYPES
+MASK_SUBPIX_VAR(4, 16)
+MASK_SUBPIX_VAR(16, 4)
+MASK_SUBPIX_VAR(8, 32)
+MASK_SUBPIX_VAR(32, 8)
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 #if CONFIG_HIGHBITDEPTH
 void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
                                  int width, int height, const uint8_t *ref8,
@@ -775,26 +872,24 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_comp_mask_upsampled_pred_c(uint16_t *comp_pred,
-                                           const uint8_t *pred8, int width,
-                                           int height, const uint8_t *ref8,
-                                           int ref_stride, const uint8_t *mask,
-                                           int mask_stride, int invert_mask) {
+void aom_highbd_comp_mask_upsampled_pred_c(
+    uint16_t *comp_pred, const uint8_t *pred8, int width, int height,
+    int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
+    const uint8_t *mask, int mask_stride, int invert_mask, int bd) {
   int i, j;
-  int stride = ref_stride << 3;
 
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
+                            ref8, ref_stride, bd);  // ref prediction first
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j << 3], pred[j]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
       else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j << 3]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
     mask += mask_stride;
   }
 }
@@ -884,6 +979,13 @@ HIGHBD_MASK_SUBPIX_VAR(64, 128)
 HIGHBD_MASK_SUBPIX_VAR(128, 64)
 HIGHBD_MASK_SUBPIX_VAR(128, 128)
 #endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_PARTITION_TYPES
+HIGHBD_MASK_SUBPIX_VAR(4, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 8)
+#endif  // CONFIG_EXT_PARTITION_TYPES
 #endif  // CONFIG_HIGHBITDEPTH
 #endif  // CONFIG_AV1 && CONFIG_EXT_INTER
 
@@ -983,6 +1085,17 @@ OBMC_VAR(128, 128)
 OBMC_SUBPIX_VAR(128, 128)
 #endif  // CONFIG_EXT_PARTITION
 
+#if CONFIG_EXT_PARTITION_TYPES
+OBMC_VAR(4, 16)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_VAR(16, 4)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_VAR(8, 32)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_VAR(32, 8)
+OBMC_SUBPIX_VAR(32, 8)
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 #if CONFIG_HIGHBITDEPTH
 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
@@ -1164,5 +1277,16 @@ HIGHBD_OBMC_SUBPIX_VAR(128, 64)
 HIGHBD_OBMC_VAR(128, 128)
 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
 #endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_PARTITION_TYPES
+HIGHBD_OBMC_VAR(4, 16)
+HIGHBD_OBMC_SUBPIX_VAR(4, 16)
+HIGHBD_OBMC_VAR(16, 4)
+HIGHBD_OBMC_SUBPIX_VAR(16, 4)
+HIGHBD_OBMC_VAR(8, 32)
+HIGHBD_OBMC_SUBPIX_VAR(8, 32)
+HIGHBD_OBMC_VAR(32, 8)
+HIGHBD_OBMC_SUBPIX_VAR(32, 8)
+#endif  // CONFIG_EXT_PARTITION_TYPES
 #endif  // CONFIG_HIGHBITDEPTH
 #endif  // CONFIG_AV1 && CONFIG_MOTION_VAR