1 files changed, 146 insertions, 318 deletions
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index 75ae08723..34374af69 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -912,8 +912,8 @@ static void highbd_warp_plane_old(const WarpedMotionParams *const wm,
       in[0] = j;
       in[1] = i;
       projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
-      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
-      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
+      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
       if (conv_params->do_average)
         pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
             pred[(j - p_col) + (i - p_row) * p_stride] +
@@ -939,136 +939,51 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                               int16_t beta, int16_t gamma, int16_t delta) {
   int32_t tmp[15 * 8];
   int i, j, k, l, m;
-
-  for (i = p_row; i < p_row + p_height; i += 8) {
-    for (j = p_col; j < p_col + p_width; j += 8) {
-      int32_t x4, y4, ix4, sx4, iy4, sy4;
-      if (subsampling_x)
-        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
-              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
-      if (subsampling_y)
-        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
-              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
-      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      sx4 += alpha * (-4) + beta * (-4);
-      sy4 += gamma * (-4) + delta * (-4);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
-      // Horizontal filter
-      for (k = -7; k < 8; ++k) {
-        int iy = iy4 + k;
-        if (iy < 0)
-          iy = 0;
-        else if (iy > height - 1)
-          iy = height - 1;
-
-        int sx = sx4 + beta * (k + 4);
-        for (l = -4; l < 4; ++l) {
-          int ix = ix4 + l - 3;
-          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
-                           WARPEDPIXEL_PREC_SHIFTS;
-          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
-
-          int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
-          for (m = 0; m < 8; ++m) {
-            int sample_x = ix + m;
-            if (sample_x < 0)
-              sample_x = 0;
-            else if (sample_x > width - 1)
-              sample_x = width - 1;
-            sum += ref[iy * stride + sample_x] * coeffs[m];
-          }
-          sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-          assert(0 <= sum &&
-                 sum < (1 << (bd + WARPEDPIXEL_FILTER_BITS + 1 -
-                              HORSHEAR_REDUCE_PREC_BITS)));
-          tmp[(k + 7) * 8 + (l + 4)] = sum;
-          sx += alpha;
-        }
-      }
-
-      // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-        for (l = -4; l < 4; ++l) {
-          uint16_t *p =
-              &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
-                           WARPEDPIXEL_PREC_SHIFTS;
-          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
-
-          int32_t sum = 1 << (bd + 2 * WARPEDPIXEL_FILTER_BITS -
-                              HORSHEAR_REDUCE_PREC_BITS);
-          for (m = 0; m < 8; ++m) {
-            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
-          }
-          sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
-          assert(0 <= sum && sum < (1 << (bd + 2)));
-          uint16_t px =
-              clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
-          if (conv_params->do_average)
-            *p = ROUND_POWER_OF_TWO(*p + px, 1);
-          else
-            *p = px;
-          sy += gamma;
-        }
-      }
-    }
-  }
-}
-
 #if CONFIG_CONVOLVE_ROUND
-void av1_highbd_warp_affine_post_round_c(
-    const int32_t *mat, const uint16_t *ref, int width, int height, int stride,
-    uint16_t *pred, int p_col, int p_row, int p_width, int p_height,
-    int p_stride, int subsampling_x, int subsampling_y, int bd,
-    ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
-    int16_t delta) {
-  (void)pred;
-  (void)p_stride;
-  int32_t tmp[15 * 8];
-  int i, j, k, l, m;
-  const int offset_bits_horiz = bd + FILTER_BITS - 1;
-  const int offset_bits_vert = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+  const int reduce_bits_horiz =
+      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+  const int max_bits_horiz =
+      use_conv_params
+          ? bd + FILTER_BITS + 1 - conv_params->round_0
+          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
+  const int offset_bits_horiz =
+      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
+  const int offset_bits_vert =
+      use_conv_params
+          ? bd + 2 * FILTER_BITS - conv_params->round_0
+          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
+  if (use_conv_params) {
+    conv_params->do_post_rounding = 1;
+  }
   assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
+#else
+  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
+  const int max_bits_horiz =
+      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
+  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
+  const int offset_bits_vert =
+      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
+#endif
+  (void)max_bits_horiz;
 
   for (i = p_row; i < p_row + p_height; i += 8) {
     for (j = p_col; j < p_col + p_width; j += 8) {
-      int32_t x4, y4, ix4, sx4, iy4, sy4;
-      if (subsampling_x)
-        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
-              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
-      if (subsampling_y)
-        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
-              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
-      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      // Take the center (j + 4, i + 4) of this 8x8 block,
+      // scale it up to luma coordinates (if this plane is subsampled),
+      // apply the affine transformation given by mat[0..5],
+      // then shift the result back down to this plane's coordinates
+      const int32_t src_x = (j + 4) << subsampling_x;
+      const int32_t src_y = (i + 4) << subsampling_y;
+      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+      const int32_t x4 = dst_x >> subsampling_x;
+      const int32_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       sx4 += alpha * (-4) + beta * (-4);
       sy4 += gamma * (-4) + delta * (-4);
@@ -1101,9 +1016,8 @@ void av1_highbd_warp_affine_post_round_c(
               sample_x = width - 1;
             sum += ref[iy * stride + sample_x] * coeffs[m];
           }
-          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
-          assert(0 <= sum &&
-                 sum < (1 << (bd + FILTER_BITS + 1 - conv_params->round_0)));
+          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+          assert(0 <= sum && sum < (1 << max_bits_horiz));
           tmp[(k + 7) * 8 + (l + 4)] = sum;
           sx += alpha;
         }
@@ -1112,7 +1026,7 @@ void av1_highbd_warp_affine_post_round_c(
       // Vertical filter
       for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
         int sy = sy4 + delta * (k + 4);
-        for (l = -4; l < 4; ++l) {
+        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
@@ -1122,22 +1036,41 @@ void av1_highbd_warp_affine_post_round_c(
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
           }
-
-          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                (1 << (offset_bits_horiz + FILTER_BITS - conv_params->round_0 -
-                       conv_params->round_1)) -
-                (1 << (offset_bits_vert - conv_params->round_1));
-          CONV_BUF_TYPE *p =
-              &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride +
-                                (j - p_col + l + 4)];
-          *p += sum;
+#if CONFIG_CONVOLVE_ROUND
+          if (use_conv_params) {
+            CONV_BUF_TYPE *p =
+                &conv_params
+                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+                           (j - p_col + l + 4)];
+            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                  (1 << (offset_bits_horiz + FILTER_BITS -
+                         conv_params->round_0 - conv_params->round_1)) -
+                  (1 << (offset_bits_vert - conv_params->round_1));
+            if (conv_params->do_average)
+              *p += sum;
+            else
+              *p = sum;
+          } else {
+#else
+          {
+#endif
+            uint16_t *p =
+                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+            assert(0 <= sum && sum < (1 << (bd + 2)));
+            uint16_t px =
+                clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
+            if (conv_params->do_average)
+              *p = ROUND_POWER_OF_TWO(*p + px, 1);
+            else
+              *p = px;
+          }
           sy += gamma;
         }
       }
     }
   }
 }
-#endif
 
 static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
                               int width, int height, int stride,
@@ -1160,25 +1093,10 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
 
     const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
     uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-#if CONFIG_CONVOLVE_ROUND
-    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-      conv_params->do_post_rounding = 1;
-      av1_highbd_warp_affine_post_round(
-          mat, ref, width, height, stride, pred, p_col, p_row, p_width,
-          p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params,
-          alpha, beta, gamma, delta);
-    } else {
-      av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col,
-                             p_row, p_width, p_height, p_stride, subsampling_x,
-                             subsampling_y, bd, conv_params, alpha, beta, gamma,
-                             delta);
-    }
-#else
     av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                            p_width, p_height, p_stride, subsampling_x,
                            subsampling_y, bd, conv_params, alpha, beta, gamma,
                            delta);
-#endif
   } else {
     highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
                           p_width, p_height, p_stride, subsampling_x,
@@ -1251,8 +1169,8 @@ static void warp_plane_old(const WarpedMotionParams *const wm,
       in[0] = j;
       in[1] = i;
       projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
-      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
-      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
+      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
       if (conv_params->do_average)
         pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
             pred[(j - p_col) + (i - p_row) * p_stride] +
@@ -1359,143 +1277,51 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
   int32_t tmp[15 * 8];
   int i, j, k, l, m;
   const int bd = 8;
-
-  for (i = p_row; i < p_row + p_height; i += 8) {
-    for (j = p_col; j < p_col + p_width; j += 8) {
-      int32_t x4, y4, ix4, sx4, iy4, sy4;
-      if (subsampling_x)
-        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
-              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
-      if (subsampling_y)
-        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
-              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
-      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      sx4 += alpha * (-4) + beta * (-4);
-      sy4 += gamma * (-4) + delta * (-4);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
-      // Horizontal filter
-      for (k = -7; k < 8; ++k) {
-        // Clamp to top/bottom edge of the frame
-        int iy = iy4 + k;
-        if (iy < 0)
-          iy = 0;
-        else if (iy > height - 1)
-          iy = height - 1;
-
-        int sx = sx4 + beta * (k + 4);
-
-        for (l = -4; l < 4; ++l) {
-          int ix = ix4 + l - 3;
-          // At this point, sx = sx4 + alpha * l + beta * k
-          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
-                           WARPEDPIXEL_PREC_SHIFTS;
-          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
-
-          int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
-          for (m = 0; m < 8; ++m) {
-            // Clamp to left/right edge of the frame
-            int sample_x = ix + m;
-            if (sample_x < 0)
-              sample_x = 0;
-            else if (sample_x > width - 1)
-              sample_x = width - 1;
-
-            sum += ref[iy * stride + sample_x] * coeffs[m];
-          }
-          sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-          assert(0 <= sum &&
-                 sum < (1 << (bd + WARPEDPIXEL_FILTER_BITS + 1 -
-                              HORSHEAR_REDUCE_PREC_BITS)));
-          tmp[(k + 7) * 8 + (l + 4)] = sum;
-          sx += alpha;
-        }
-      }
-
-      // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
-          uint8_t *p =
-              &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-          // At this point, sy = sy4 + gamma * l + delta * k
-          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
-                           WARPEDPIXEL_PREC_SHIFTS;
-          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = warped_filter[offs];
-
-          int32_t sum = 1 << (bd + 2 * WARPEDPIXEL_FILTER_BITS -
-                              HORSHEAR_REDUCE_PREC_BITS);
-          for (m = 0; m < 8; ++m) {
-            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
-          }
-          sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
-          assert(0 <= sum && sum < (1 << (bd + 2)));
-          uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
-          if (conv_params->do_average)
-            *p = ROUND_POWER_OF_TWO(*p + px, 1);
-          else
-            *p = px;
-          sy += gamma;
-        }
-      }
-    }
-  }
-}
-
 #if CONFIG_CONVOLVE_ROUND
-void av1_warp_affine_post_round_c(const int32_t *mat, const uint8_t *ref,
-                                  int width, int height, int stride,
-                                  uint8_t *pred, int p_col, int p_row,
-                                  int p_width, int p_height, int p_stride,
-                                  int subsampling_x, int subsampling_y,
-                                  ConvolveParams *conv_params, int16_t alpha,
-                                  int16_t beta, int16_t gamma, int16_t delta) {
-  (void)pred;
-  (void)p_stride;
-  int32_t tmp[15 * 8];
-  int i, j, k, l, m;
-  const int bd = 8;
-  const int offset_bits_horiz = bd + FILTER_BITS - 1;
-  const int offset_bits_vert = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+  const int reduce_bits_horiz =
+      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+  const int max_bits_horiz =
+      use_conv_params
+          ? bd + FILTER_BITS + 1 - conv_params->round_0
+          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
+  const int offset_bits_horiz =
+      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
+  const int offset_bits_vert =
+      use_conv_params
+          ? bd + 2 * FILTER_BITS - conv_params->round_0
+          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
+  if (use_conv_params) {
+    conv_params->do_post_rounding = 1;
+  }
   assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
+#else
+  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
+  const int max_bits_horiz =
+      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
+  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
+  const int offset_bits_vert =
+      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
+#endif
+  (void)max_bits_horiz;
 
   for (i = p_row; i < p_row + p_height; i += 8) {
     for (j = p_col; j < p_col + p_width; j += 8) {
-      int32_t x4, y4, ix4, sx4, iy4, sy4;
-      if (subsampling_x)
-        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
-              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
-      if (subsampling_y)
-        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
-              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
-             4;
-      else
-        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
-      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      // Take the center (j + 4, i + 4) of this 8x8 block,
+      // scale it up to luma coordinates (if this plane is subsampled),
+      // apply the affine transformation given by mat[0..5],
+      // then shift the result back down to this plane's coordinates
+      const int32_t src_x = (j + 4) << subsampling_x;
+      const int32_t src_y = (i + 4) << subsampling_y;
+      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+      const int32_t x4 = dst_x >> subsampling_x;
+      const int32_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       sx4 += alpha * (-4) + beta * (-4);
       sy4 += gamma * (-4) + delta * (-4);
@@ -1533,9 +1359,8 @@ void av1_warp_affine_post_round_c(const int32_t *mat, const uint8_t *ref,
 
             sum += ref[iy * stride + sample_x] * coeffs[m];
           }
-          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
-          assert(0 <= sum &&
-                 sum < (1 << (bd + FILTER_BITS + 1 - conv_params->round_0)));
+          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+          assert(0 <= sum && sum < (1 << max_bits_horiz));
           tmp[(k + 7) * 8 + (l + 4)] = sum;
           sx += alpha;
         }
@@ -1552,26 +1377,43 @@ void av1_warp_affine_post_round_c(const int32_t *mat, const uint8_t *ref,
           const int16_t *coeffs = warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_vert;
-
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
           }
-
-          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                (1 << (offset_bits_horiz + FILTER_BITS - conv_params->round_0 -
-                       conv_params->round_1)) -
-                (1 << (offset_bits_vert - conv_params->round_1));
-          CONV_BUF_TYPE *p =
-              &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride +
-                                (j - p_col + l + 4)];
-          *p += sum;
+#if CONFIG_CONVOLVE_ROUND
+          if (use_conv_params) {
+            CONV_BUF_TYPE *p =
+                &conv_params
+                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+                           (j - p_col + l + 4)];
+            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                  (1 << (offset_bits_horiz + FILTER_BITS -
+                         conv_params->round_0 - conv_params->round_1)) -
+                  (1 << (offset_bits_vert - conv_params->round_1));
+            if (conv_params->do_average)
+              *p += sum;
+            else
+              *p = sum;
+          } else {
+#else
+          {
+#endif
+            uint8_t *p =
+                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+            assert(0 <= sum && sum < (1 << (bd + 2)));
+            uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
+            if (conv_params->do_average)
+              *p = ROUND_POWER_OF_TWO(*p + px, 1);
+            else
+              *p = px;
+          }
           sy += gamma;
         }
       }
     }
   }
 }
-#endif  // CONFIG_CONVOLVE_ROUND
 
 static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
                        int width, int height, int stride, uint8_t *pred,
@@ -1590,23 +1432,9 @@ static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
     const int16_t gamma = wm->gamma;
     const int16_t delta = wm->delta;
 
-#if CONFIG_CONVOLVE_ROUND
-    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-      conv_params->do_post_rounding = 1;
-      av1_warp_affine_post_round(mat, ref, width, height, stride, pred, p_col,
-                                 p_row, p_width, p_height, p_stride,
-                                 subsampling_x, subsampling_y, conv_params,
-                                 alpha, beta, gamma, delta);
-    } else {
-      av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
-                      p_width, p_height, p_stride, subsampling_x, subsampling_y,
-                      conv_params, alpha, beta, gamma, delta);
-    }
-#else
     av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                     p_width, p_height, p_stride, subsampling_x, subsampling_y,
                     conv_params, alpha, beta, gamma, delta);
-#endif
   } else {
     warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
                    p_height, p_stride, subsampling_x, subsampling_y, x_scale,