author     trav90 <travawine@palemoon.org>  2018-10-19 23:00:02 -0500
committer  trav90 <travawine@palemoon.org>  2018-10-19 23:00:02 -0500
commit     b8df135c97a854c2ff9b4394b016649c601177fa (patch)
tree       802b7de5ad245f1a12adbcef835ab0d0687c1bf8 /third_party/aom/av1/common
parent     a4d3c59dcac642f6b9557dc09b60eda40b517630 (diff)
Update libaom to rev b25610052a1398032320008d69b51d2da94f5928
Diffstat (limited to 'third_party/aom/av1/common')
-rw-r--r--  third_party/aom/av1/common/alloccommon.c  4
-rw-r--r--  third_party/aom/av1/common/arm/av1_inv_txfm_neon.c  844
-rw-r--r--  third_party/aom/av1/common/arm/av1_inv_txfm_neon.h  152
-rw-r--r--  third_party/aom/av1/common/arm/convolve_neon.c  24
-rw-r--r--  third_party/aom/av1/common/arm/intrapred_neon.c  79
-rw-r--r--  third_party/aom/av1/common/arm/jnt_convolve_neon.c  24
-rw-r--r--  third_party/aom/av1/common/arm/mem_neon.h  84
-rw-r--r--  third_party/aom/av1/common/arm/selfguided_neon.c  1506
-rw-r--r--  third_party/aom/av1/common/arm/transpose_neon.h  38
-rw-r--r--  third_party/aom/av1/common/av1_loopfilter.c  51
-rw-r--r--  third_party/aom/av1/common/av1_rtcd.c  6
-rwxr-xr-x  third_party/aom/av1/common/av1_rtcd_defs.pl  52
-rw-r--r--  third_party/aom/av1/common/av1_txfm.h  47
-rw-r--r--  third_party/aom/av1/common/blockd.h  47
-rw-r--r--  third_party/aom/av1/common/cdef.c  6
-rw-r--r--  third_party/aom/av1/common/cfl.c  15
-rw-r--r--  third_party/aom/av1/common/convolve.c  154
-rw-r--r--  third_party/aom/av1/common/convolve.h  18
-rw-r--r--  third_party/aom/av1/common/enums.h  2
-rw-r--r--  third_party/aom/av1/common/filter.c  120
-rw-r--r--  third_party/aom/av1/common/filter.h  116
-rw-r--r--  third_party/aom/av1/common/mv.h  3
-rw-r--r--  third_party/aom/av1/common/mvref_common.h  2
-rw-r--r--  third_party/aom/av1/common/onyxc_int.h  76
-rw-r--r--  third_party/aom/av1/common/quant_common.c  23
-rw-r--r--  third_party/aom/av1/common/quant_common.h  1
-rw-r--r--  third_party/aom/av1/common/reconinter.c  65
-rw-r--r--  third_party/aom/av1/common/reconinter.h  10
-rw-r--r--  third_party/aom/av1/common/reconintra.c  19
-rw-r--r--  third_party/aom/av1/common/reconintra.h  10
-rw-r--r--  third_party/aom/av1/common/resize.c  41
-rw-r--r--  third_party/aom/av1/common/restoration.c  40
-rw-r--r--  third_party/aom/av1/common/restoration.h  2
-rw-r--r--  third_party/aom/av1/common/scan.h  7
-rw-r--r--  third_party/aom/av1/common/thread_common.c  2
-rw-r--r--  third_party/aom/av1/common/tile_common.c  4
-rw-r--r--  third_party/aom/av1/common/timing.c  4
-rw-r--r--  third_party/aom/av1/common/timing.h  10
-rw-r--r--  third_party/aom/av1/common/txb_common.h  25
-rw-r--r--  third_party/aom/av1/common/warped_motion.c  27
-rw-r--r--  third_party/aom/av1/common/warped_motion.h  5
-rw-r--r--  third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c  16
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c  1058
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h  141
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm_sse4.c  11
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm_sse4.h  11
-rw-r--r--  third_party/aom/av1/common/x86/convolve_2d_avx2.c  16
-rw-r--r--  third_party/aom/av1/common/x86/convolve_2d_sse2.c  16
-rw-r--r--  third_party/aom/av1/common/x86/convolve_avx2.c  16
-rw-r--r--  third_party/aom/av1/common/x86/convolve_sse2.c  10
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c  8
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c  4
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c  12
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c  12
-rw-r--r--  third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c  40
-rw-r--r--  third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c  8
-rw-r--r--  third_party/aom/av1/common/x86/jnt_convolve_avx2.c  32
-rw-r--r--  third_party/aom/av1/common/x86/jnt_convolve_sse2.c  8
-rw-r--r--  third_party/aom/av1/common/x86/jnt_convolve_ssse3.c  8
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_sse4.c  11
60 files changed, 3708 insertions, 1495 deletions
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
index 49902cc7d..1bf81c91d 100644
--- a/third_party/aom/av1/common/alloccommon.c
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -137,11 +137,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
// Now we need to allocate enough space to store the line buffers for the
// stripes
const int frame_w = cm->superres_upscaled_width;
- const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
for (int p = 0; p < num_planes; ++p) {
const int is_uv = p > 0;
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
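The hunk above follows libaom's move of stream-level parameters (bit depth, chroma subsampling, and related fields) from AV1_COMMON into the nested sequence header at cm->seq_params; the rest of this commit applies the same access-path update throughout av1/common. A minimal sketch of the new access pattern, assuming the AV1_COMMON/SequenceHeader layout at this libaom revision (the helper name is illustrative, not part of the commit):

    // Hypothetical helper: restoration-plane width after the seq_params move.
    static int restoration_plane_width(const AV1_COMMON *cm, int plane) {
      const int is_uv = plane > 0;
      // subsampling_x now lives in cm->seq_params rather than directly in cm.
      const int ss_x = is_uv && cm->seq_params.subsampling_x;
      return ((cm->superres_upscaled_width + ss_x) >> ss_x) +
             2 * RESTORATION_EXTRA_HORZ;
    }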
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
new file mode 100644
index 000000000..51c991498
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -0,0 +1,844 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+
+static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
+ const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+ TxSetType tx_set_type;
+ if (tx_size_sqr_up > TX_32X32) {
+ tx_set_type = EXT_TX_SET_DCTONLY;
+ } else if (tx_size_sqr_up == TX_32X32) {
+ tx_set_type = EXT_TX_SET_DCT_IDTX;
+ } else {
+ tx_set_type = EXT_TX_SET_ALL16;
+ }
+ return tx_set_type;
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
+ { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
+ { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
+ { av1_idct32_new, NULL, NULL },
+ { av1_idct64_new, NULL, NULL },
+};
+
+// Functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner
+static const transform_1d_neon
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { av1_idct4_new, av1_idct4_new, NULL, NULL },
+ { av1_iadst4_new, av1_iadst4_new, NULL, NULL },
+ { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL },
+ },
+ { { av1_idct8_new, av1_idct8_new, NULL, NULL },
+ { av1_iadst8_new, av1_iadst8_new, NULL, NULL },
+ { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } },
+ {
+ { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL },
+ { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL },
+ { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL },
+ },
+ { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new },
+ { NULL, NULL, NULL, NULL },
+ { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c,
+ av1_iidentity32_c } },
+ { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ // row tx
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ // Doing memset for the rows which are not processed in row transform.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ // col tx
+ for (int c = 0; c < txfm_size_col; c++) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c];
+
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // row tx
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+ // Doing memset for the rows which are not processed in row transform.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ // col tx
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // row tx
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+ // Doing memset for the rows which are not processed in row transform.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ // col tx
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby, ud_flip, lr_flip, row_start;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ const int bd = 8;
+ int r;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ row_start = (buf_size_nonzero_h_div8 << 3);
+
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ // Doing memset for the rows which are not processed in row transform.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_universe_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case IDTX:
+ lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ default:
+ lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ int row;
+ switch (tx_size) {
+ case TX_4X4:
+ lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_4X8:
+ lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_8X4:
+ lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_4X16:
+ lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_16X4:
+ lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_16X64: {
+ lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ case TX_64X16: {
+ int32_t mod_input[64 * 16];
+ for (row = 0; row < 16; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ case TX_32X64: {
+ lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ case TX_64X32: {
+ int32_t mod_input[64 * 32];
+ for (row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ case TX_64X64: {
+ int32_t mod_input[64 * 64];
+ for (row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ default:
+ lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (!txfm_param->lossless) {
+ av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
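av1_inv_txfm_add_neon() mirrors the C dispatch: lossless blocks fall back to av1_inv_txfm_add_c(), everything else goes through av1_lowbd_inv_txfm2d_add_neon(), which picks a per-size helper and, via lowbd_inv_txfm2d_add_universe_neon(), selects 1-D kernels by transform type and eob position. A minimal caller sketch, assuming a dequantized coefficient buffer; only the TxfmParam fields read on this path are set, and the wrapper name is illustrative:

    // Hypothetical wrapper: reconstruct one block with the NEON inverse txfm.
    static void add_inv_txfm_block_neon(const tran_low_t *dqcoeff, uint8_t *dst,
                                        int stride, TX_SIZE tx_size,
                                        TX_TYPE tx_type, int eob, int lossless) {
      TxfmParam param = { 0 };
      param.tx_type = tx_type;
      param.tx_size = tx_size;
      param.eob = eob;
      param.lossless = lossless;
      av1_inv_txfm_add_neon(dqcoeff, dst, stride, &param);
    }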
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
new file mode 100644
index 000000000..6af2d61e7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+
+typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
+ const int8_t cos_bit,
+ const int8_t *stage_ptr);
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+static int eob_fill[32] = {
+ 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ if (eob == 1) {
+ *eobx = 0;
+ *eoby = 0;
+ return;
+ }
+
+ const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+ const int eob_row = (eob - 1) >> tx_w_log2;
+ const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+ *eobx = eobxy & 0xFF;
+ *eoby = eobxy >> 8;
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+ *eobx = eob / (eoby_max + 1);
+ *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+ *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+ const int temp_eoby = eob / (eobx_max + 1);
+ assert(temp_eoby < 32);
+ *eoby = eob_fill[temp_eoby];
+}
+
+#endif // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
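Each entry of the av1_eob_to_eobxy_*_default tables packs eobx in the low byte and eoby in the high byte, indexed by the row of the last nonzero coefficient (64-wide/-tall sizes map to 32, per tx_size_wide_log2_eob). A small worked example, reading the values straight from the tables above:

    // Illustrative only: TX_16X16 with eob = 35.
    //   tx_w_log2 = tx_size_wide_log2_eob[TX_16X16] = 4
    //   eob_row   = (35 - 1) >> 4 = 2
    //   entry     = av1_eob_to_eobxy_16x16_default[2] = 0x0f0f
    //   eobx      = 0x0f0f & 0xFF = 15, eoby = 0x0f0f >> 8 = 15
    int eobx, eoby;
    get_eobx_eoby_scan_default(&eobx, &eoby, TX_16X16, 35);
    assert(eobx == 15 && eoby == 15);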
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
index 86a25e109..f15744c94 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -164,8 +164,8 @@ static INLINE uint8x8_t convolve8_vert_8x4_s32(
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
@@ -182,7 +182,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@@ -485,8 +485,8 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int vert_offset = filter_params_y->taps / 2 - 1;
@@ -502,7 +502,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
if (w <= 4) {
uint8x8_t d01, d23;
@@ -680,8 +680,8 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int im_dst_stride;
@@ -711,7 +711,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
int16_t x_filter_tmp[8];
int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -896,7 +896,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@@ -1086,8 +1086,8 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
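The convolve changes in this file are purely const-correctness: the InterpFilterParams arguments become const pointers and av1_get_interp_filter_subpel_kernel() is now called with the pointer instead of a dereferenced copy. The same update is repeated in jnt_convolve_neon.c and the x86 kernels below. The updated NEON prototype in isolation, copied from the hunks above:

    void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params);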
diff --git a/third_party/aom/av1/common/arm/intrapred_neon.c b/third_party/aom/av1/common/arm/intrapred_neon.c
deleted file mode 100644
index 799355553..000000000
--- a/third_party/aom/av1/common/arm/intrapred_neon.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- *
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void highbd_dc_predictor_neon(uint16_t *dst, ptrdiff_t stride,
- int bw, const uint16_t *above,
- const uint16_t *left) {
- assert(bw >= 4);
- assert(IS_POWER_OF_TWO(bw));
- int expected_dc, sum = 0;
- const int count = bw * 2;
- uint32x4_t sum_q = vdupq_n_u32(0);
- uint32x2_t sum_d;
- uint16_t *dst_1;
- if (bw >= 8) {
- for (int i = 0; i < bw; i += 8) {
- sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
- sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
- above += 8;
- left += 8;
- }
- sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
- sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
- expected_dc = (sum + (count >> 1)) / count;
- const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
- for (int r = 0; r < bw; r++) {
- dst_1 = dst;
- for (int i = 0; i < bw; i += 8) {
- vst1q_u16(dst_1, dc);
- dst_1 += 8;
- }
- dst += stride;
- }
- } else { // 4x4
- sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
- sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
- sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
- expected_dc = (sum + (count >> 1)) / count;
- const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
- for (int r = 0; r < bw; r++) {
- vst1_u16(dst, dc);
- dst += stride;
- }
- }
-}
-
-#define intra_pred_highbd_sized(type, width) \
- void aom_highbd_##type##_predictor_##width##x##width##_neon( \
- uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
- const uint16_t *left, int bd) { \
- (void)bd; \
- highbd_##type##_predictor_neon(dst, stride, width, above, left); \
- }
-
-#define intra_pred_square(type) \
- intra_pred_highbd_sized(type, 4); \
- intra_pred_highbd_sized(type, 8); \
- intra_pred_highbd_sized(type, 16); \
- intra_pred_highbd_sized(type, 32); \
- intra_pred_highbd_sized(type, 64);
-
-intra_pred_square(dc);
-
-#undef intra_pred_square
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
index 992be4a9e..4015082b4 100644
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -515,8 +515,8 @@ static INLINE void jnt_convolve_2d_vert_neon(
void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
assert(!(w % 4));
@@ -532,9 +532,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
const int round_0 = conv_params->round_0 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
int16_t x_filter_tmp[8];
int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -553,8 +553,8 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
@@ -679,8 +679,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
assert(!(w % 4));
@@ -705,7 +705,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const uint8_t *src_ptr = src - horiz_offset;
@@ -1013,8 +1013,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
assert(!(w % 4));
@@ -1040,7 +1040,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const uint8_t *src_ptr = src - (vert_offset * src_stride);
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
index 214b14bf7..4bf45a52c 100644
--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -22,6 +22,14 @@ static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
s += p;
}
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define load_u8_4x1(s, s0, lane) \
+ do { \
+ *(s0) = vreinterpret_u8_u32( \
+ vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
+ } while (0)
+
static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2, uint8x8_t *const s3,
@@ -128,6 +136,13 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
*s3 = vld1_s16(s);
}
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define store_u8_4x1(s, s0, lane) \
+ do { \
+ vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
+ } while (0)
+
static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
const uint8x8_t s1, const uint8x8_t s2,
const uint8x8_t s3, const uint8x8_t s4,
@@ -242,6 +257,30 @@ static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
vst1q_s16(s, s7);
}
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3) {
+ vst1_s16(s, s0);
+ s += dst_stride;
+ vst1_s16(s, s1);
+ s += dst_stride;
+ vst1_s16(s, s2);
+ s += dst_stride;
+ vst1_s16(s, s3);
+}
+
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3) {
+ vst1q_s16(s, s0);
+ s += dst_stride;
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);
+}
+
static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
int16x8_t *const s0, int16x8_t *const s1,
int16x8_t *const s2, int16x8_t *const s3,
@@ -398,4 +437,49 @@ static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
*tu1 = vsetq_lane_u64(a, *tu1, 1);
}
+static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
+ int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+ *s1 = vld1q_s32(s);
+ s += p;
+ *s2 = vld1q_s32(s);
+ s += p;
+ *s3 = vld1q_s32(s);
+ s += p;
+ *s4 = vld1q_s32(s);
+}
+
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+ int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+ vst1q_s32(s, s1);
+ s += p;
+ vst1q_s32(s, s2);
+ s += p;
+ vst1q_s32(s, s3);
+ s += p;
+ vst1q_s32(s, s4);
+}
+
+static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
+ uint32x4_t *s2, uint32x4_t *s3,
+ uint32x4_t *s4) {
+ *s1 = vld1q_u32(s);
+ s += p;
+ *s2 = vld1q_u32(s);
+ s += p;
+ *s3 = vld1q_u32(s);
+ s += p;
+ *s4 = vld1q_u32(s);
+}
+
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+ uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+ vst1q_u32(s, s1);
+ s += p;
+ vst1q_u32(s, s2);
+ s += p;
+ vst1q_u32(s, s3);
+ s += p;
+ vst1q_u32(s, s4);
+}
+
#endif // AV1_COMMON_ARM_MEM_NEON_H_
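load_u8_4x1() and store_u8_4x1() are macros rather than inline functions because vld1_lane_u32()/vst1_lane_u32() require the lane index to be a compile-time constant, as the comments above note. A minimal usage sketch, assuming a NEON target; the helper name is illustrative:

    // Hypothetical helper: copy 4 bytes through lane 0 of a uint8x8_t.
    static INLINE void copy_4bytes(uint8_t *dst, const uint8_t *src) {
      uint8x8_t v = vdup_n_u8(0);
      load_u8_4x1(src, &v, 0);   // lane argument must be a literal constant
      store_u8_4x1(dst, v, 0);
    }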
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
new file mode 100644
index 000000000..b4808a972
--- /dev/null
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1506 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
+static INLINE void calc_ab_fast_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
+ int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
+ uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
+ uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
+ const int buf_stride) {
+ uint32x4_t q0, q1, q2, q3;
+ uint32x4_t p0, p1, p2, p3;
+ uint16x4_t d0, d1, d2, d3;
+
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+
+ q0 = vmulq_u32(s4, s4);
+ q1 = vmulq_u32(s5, s5);
+ q2 = vmulq_u32(s6, s6);
+ q3 = vmulq_u32(s7, s7);
+
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+
+ {
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 4; y++) {
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
+ }
+ p0 = vsubl_u16(sgrproj_sgr, d0);
+ p1 = vsubl_u16(sgrproj_sgr, d1);
+ p2 = vsubl_u16(sgrproj_sgr, d2);
+ p3 = vsubl_u16(sgrproj_sgr, d3);
+
+ s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
+ s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+ s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+ s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+
+ s4 = vmulq_u32(s4, p0);
+ s5 = vmulq_u32(s5, p1);
+ s6 = vmulq_u32(s6, p2);
+ s7 = vmulq_u32(s7, p3);
+
+ p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+}
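+// Same computation as calc_ab_fast_internal_common(), but for the r == 1
+// path: it processes an 8-wide tile per call and takes the pixel sums as
+// 16-bit vectors.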
+static INLINE void calc_ab_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,
+ uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,
+ uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,
+ uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,
+ uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,
+ uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {
+ uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7;
+
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+ s4 = vmulq_u32(s4, const_n_val);
+ s5 = vmulq_u32(s5, const_n_val);
+ s6 = vmulq_u32(s6, const_n_val);
+ s7 = vmulq_u32(s7, const_n_val);
+
+ d0 = vget_low_u16(s16_4);
+ d1 = vget_low_u16(s16_5);
+ d2 = vget_low_u16(s16_6);
+ d3 = vget_low_u16(s16_7);
+ d4 = vget_high_u16(s16_4);
+ d5 = vget_high_u16(s16_5);
+ d6 = vget_high_u16(s16_6);
+ d7 = vget_high_u16(s16_7);
+
+ q0 = vmull_u16(d0, d0);
+ q1 = vmull_u16(d1, d1);
+ q2 = vmull_u16(d2, d2);
+ q3 = vmull_u16(d3, d3);
+ q4 = vmull_u16(d4, d4);
+ q5 = vmull_u16(d5, d5);
+ q6 = vmull_u16(d6, d6);
+ q7 = vmull_u16(d7, d7);
+
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+ p4 = vcleq_u32(q4, s4);
+ p5 = vcleq_u32(q5, s5);
+ p6 = vcleq_u32(q6, s6);
+ p7 = vcleq_u32(q7, s7);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+ q4 = vsubq_u32(s4, q4);
+ q5 = vsubq_u32(s5, q5);
+ q6 = vsubq_u32(s6, q6);
+ q7 = vsubq_u32(s7, q7);
+
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+ p4 = vandq_u32(p4, q4);
+ p5 = vandq_u32(p5, q5);
+ p6 = vandq_u32(p6, q6);
+ p7 = vandq_u32(p7, q7);
+
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+ p4 = vmulq_u32(p4, s_vec);
+ p5 = vmulq_u32(p5, s_vec);
+ p6 = vmulq_u32(p6, s_vec);
+ p7 = vmulq_u32(p7, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+ p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS);
+ p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS);
+ p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS);
+ p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+ p4 = vminq_u32(p4, const_val);
+ p5 = vminq_u32(p5, const_val);
+ p6 = vminq_u32(p6, const_val);
+ p7 = vminq_u32(p7, const_val);
+
+ {
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+ store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7);
+
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 8; y++) {
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
+ }
+
+ s16_4 = vsubq_u16(sgrproj_sgr, s16_4);
+ s16_5 = vsubq_u16(sgrproj_sgr, s16_5);
+ s16_6 = vsubq_u16(sgrproj_sgr, s16_6);
+ s16_7 = vsubq_u16(sgrproj_sgr, s16_7);
+
+ s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec);
+ s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec);
+ s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec);
+ s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec);
+ s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec);
+ s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec);
+ s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec);
+ s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec);
+
+ s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4)));
+ s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5)));
+ s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6)));
+ s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7)));
+ s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4)));
+ s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5)));
+ s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6)));
+ s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7)));
+
+ p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS);
+ p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+ store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4),
+ vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6),
+ vreinterpretq_s32_u32(p7));
+}
+
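+// Produces four vertical 5-row sums of squares from eleven input rows:
+// rows 1-5, 3-7, 5-9 and 7-11 (the fast path only needs every other
+// output row).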
+static INLINE void boxsum2_square_sum_calc(
+ int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,
+ int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
+ int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
+ int32x4_t r12, r34, r67, r89, r1011;
+ int32x4_t r345, r6789, r789;
+
+ d1 = vmull_s16(t1, t1);
+ d2 = vmull_s16(t2, t2);
+ d3 = vmull_s16(t3, t3);
+ d4 = vmull_s16(t4, t4);
+ d5 = vmull_s16(t5, t5);
+ d6 = vmull_s16(t6, t6);
+ d7 = vmull_s16(t7, t7);
+ d8 = vmull_s16(t8, t8);
+ d9 = vmull_s16(t9, t9);
+ d10 = vmull_s16(t10, t10);
+ d11 = vmull_s16(t11, t11);
+
+ r12 = vaddq_s32(d1, d2);
+ r34 = vaddq_s32(d3, d4);
+ r67 = vaddq_s32(d6, d7);
+ r89 = vaddq_s32(d8, d9);
+ r1011 = vaddq_s32(d10, d11);
+ r345 = vaddq_s32(r34, d5);
+ r6789 = vaddq_s32(r67, r89);
+ r789 = vsubq_s32(r6789, d6);
+ *r0 = vaddq_s32(r12, r345);
+ *r1 = vaddq_s32(r67, r345);
+ *r2 = vaddq_s32(d5, r6789);
+ *r3 = vaddq_s32(r789, r1011);
+}
+
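+// Vertical then horizontal 5-tap (radius 2) box sums used by the r == 2 fast
+// path: pixel sums go to dst16 (vertical pass) and dst32 (final), squared
+// pixel sums go to dst2.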
+static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
+ int32_t *dst32, int32_t *dst2, const int dst_stride,
+ const int width, const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *dst1_16_ptr, *src_ptr;
+ int32_t *dst2_ptr;
+ int h, w, count = 0;
+ const int dst_stride_2 = (dst_stride << 1);
+ const int dst_stride_8 = (dst_stride << 3);
+
+ dst1_16_ptr = dst16;
+ dst2_ptr = dst2;
+ src_ptr = src;
+ w = width;
+ {
+ int16x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t t8, t9, t10, t11, t12;
+
+ int16x8_t q12345, q56789, q34567, q7891011;
+ int16x8_t q12, q34, q67, q89, q1011;
+ int16x8_t q345, q6789, q789;
+
+ int32x4_t r12345, r56789, r34567, r7891011;
+
+ do {
+ h = height;
+ dst1_16_ptr = dst16 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+
+ dst1_16_ptr += dst_stride_2;
+ dst2_ptr += dst_stride_2;
+ do {
+ load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);
+
+ q12 = vaddq_s16(t1, t2);
+ q34 = vaddq_s16(t3, t4);
+ q67 = vaddq_s16(t6, t7);
+ q89 = vaddq_s16(t8, t9);
+ q1011 = vaddq_s16(t10, t11);
+ q345 = vaddq_s16(q34, t5);
+ q6789 = vaddq_s16(q67, q89);
+ q789 = vaddq_s16(q89, t7);
+ q12345 = vaddq_s16(q12, q345);
+ q34567 = vaddq_s16(q67, q345);
+ q56789 = vaddq_s16(t5, q6789);
+ q7891011 = vaddq_s16(q789, q1011);
+
+ store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,
+ q7891011);
+ dst1_16_ptr += dst_stride_8;
+
+ boxsum2_square_sum_calc(
+ vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
+ vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
+ vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
+ vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);
+
+ boxsum2_square_sum_calc(
+ vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
+ vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
+ vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
+ vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
+ r7891011);
+ dst2_ptr += (dst_stride_8);
+ h -= 8;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ {
+ int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int32x4_t q12345, q34567, q23456, q45678;
+ int32x4_t q23, q45, q67;
+ int32x4_t q2345, q4567;
+
+ int32x4_t r12345, r34567, r23456, r45678;
+ int32x4_t r23, r45, r67;
+ int32x4_t r2345, r4567;
+
+ int32_t *src2_ptr, *dst1_32_ptr;
+ int16_t *src1_ptr;
+ count = 0;
+ h = height;
+ do {
+ dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);
+ dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
+ src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ w = width;
+
+ dst1_32_ptr += 2;
+ dst2_ptr += 2;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
+ transpose_s16_4x4d(&s1, &s2, &s3, &s4);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
+ transpose_s32_4x4(&d1, &d2, &d3, &d4);
+ do {
+ src1_ptr += 4;
+ src2_ptr += 4;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
+ transpose_s16_4x4d(&s5, &s6, &s7, &s8);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
+ transpose_s32_4x4(&d5, &d6, &d7, &d8);
+ q23 = vaddl_s16(s2, s3);
+ q45 = vaddl_s16(s4, s5);
+ q67 = vaddl_s16(s6, s7);
+ q2345 = vaddq_s32(q23, q45);
+ q4567 = vaddq_s32(q45, q67);
+ q12345 = vaddq_s32(vmovl_s16(s1), q2345);
+ q23456 = vaddq_s32(q2345, vmovl_s16(s6));
+ q34567 = vaddq_s32(q4567, vmovl_s16(s3));
+ q45678 = vaddq_s32(q4567, vmovl_s16(s8));
+
+ transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678);
+ store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
+ q45678);
+ dst1_32_ptr += 4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+
+ r23 = vaddq_s32(d2, d3);
+ r45 = vaddq_s32(d4, d5);
+ r67 = vaddq_s32(d6, d7);
+ r2345 = vaddq_s32(r23, r45);
+ r4567 = vaddq_s32(r45, r67);
+ r12345 = vaddq_s32(d1, r2345);
+ r23456 = vaddq_s32(r2345, d6);
+ r34567 = vaddq_s32(r4567, d3);
+ r45678 = vaddq_s32(r4567, d8);
+
+ transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
+ dst2_ptr += 4;
+ d1 = d5;
+ d2 = d6;
+ d3 = d7;
+ d4 = d8;
+ w -= 4;
+ } while (w > 0);
+ h -= 8;
+ count++;
+ } while (h > 0);
+ }
+}
+
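+// Low-bit-depth a/b calculation for the r == 1 path: feeds the box sums
+// straight into the common kernel, 8 columns at a time. The high-bit-depth
+// variant below first rounds the pixel sums down by (bit_depth - 8) and the
+// squared sums by 2 * (bit_depth - 8) so both paths work in an
+// 8-bit-equivalent range.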
+static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
+ uint16_t *B16, int32_t *B,
+ const int buf_stride, const int width,
+ const int height, const int r,
+ const int s, const int ht_inc) {
+ int32_t *src1, *dst2, count = 0;
+ uint16_t *dst_A16, *src2;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;
+
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B16 + (count << 2) * buf_stride;
+ dst2 = B + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3);
+ load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7);
+ load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+
+ s16_4 = s16_0;
+ s16_5 = s16_1;
+ s16_6 = s16_2;
+ s16_7 = s16_3;
+
+ calc_ab_internal_common(
+ s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+ s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+ w -= 8;
+ dst2 += 8;
+ src1 += 8;
+ src2 += 8;
+ dst_A16 += 8;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
+ uint16_t *B16, int32_t *B,
+ const int buf_stride, const int width,
+ const int height, const int bit_depth,
+ const int r, const int s,
+ const int ht_inc) {
+ int32_t *src1, *dst2, count = 0;
+ uint16_t *dst_A16, *src2;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8));
+ const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint16x8_t s16_0, s16_1, s16_2, s16_3;
+ uint16x8_t s16_4, s16_5, s16_6, s16_7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B16 + (count << 2) * buf_stride;
+ dst2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
+ load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7);
+ load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+
+ s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
+ s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+ s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+ s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+ s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec);
+ s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec);
+ s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec);
+ s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec);
+
+ s16_4 = vrshlq_u16(s16_0, bd_min_2_vec);
+ s16_5 = vrshlq_u16(s16_1, bd_min_2_vec);
+ s16_6 = vrshlq_u16(s16_2, bd_min_2_vec);
+ s16_7 = vrshlq_u16(s16_3, bd_min_2_vec);
+
+ calc_ab_internal_common(
+ s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+ s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+ w -= 8;
+ dst2 += 8;
+ src1 += 8;
+ src2 += 8;
+ dst_A16 += 8;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
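+// Fast-path (r == 2) a/b calculation, operating on every other row and four
+// columns at a time; the high-bit-depth variant applies the same
+// (bit_depth - 8) normalisation as above.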
+static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
+ int32_t *B, const int buf_stride,
+ const int width, const int height,
+ const int r, const int s,
+ const int ht_inc) {
+ int32_t *src1, *src2, count = 0;
+ uint16_t *dst_A16;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
+ load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);
+
+ s0 = vreinterpretq_u32_s32(sr0);
+ s1 = vreinterpretq_u32_s32(sr1);
+ s2 = vreinterpretq_u32_s32(sr2);
+ s3 = vreinterpretq_u32_s32(sr3);
+ s4 = vreinterpretq_u32_s32(sr4);
+ s5 = vreinterpretq_u32_s32(sr5);
+ s6 = vreinterpretq_u32_s32(sr6);
+ s7 = vreinterpretq_u32_s32(sr7);
+
+ calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
+ sr6, sr7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1,
+ dst_A16, src2, buf_stride);
+
+ w -= 4;
+ src1 += 4;
+ src2 += 4;
+ dst_A16 += 4;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
+ int32_t *B, const int buf_stride,
+ const int width, const int height,
+ const int bit_depth, const int r,
+ const int s, const int ht_inc) {
+ int32_t *src1, *src2, count = 0;
+ uint16_t *dst_A16;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8));
+ const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
+ load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);
+
+ s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
+ s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+ s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+ s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+ s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec);
+ s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec);
+ s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec);
+ s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec);
+
+ calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
+ sr6, sr7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1,
+ dst_A16, src2, buf_stride);
+
+ w -= 4;
+ src1 += 4;
+ src2 += 4;
+ dst_A16 += 4;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
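+// Vertical then horizontal 3-tap (radius 1) box sums used by the r == 1 path:
+// pixel sums go to dst1, squared pixel sums to dst2.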
+static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
+ int32_t *dst2, const int dst_stride, const int width,
+ const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *src_ptr;
+ int32_t *dst2_ptr;
+ uint16_t *dst1_ptr;
+ int h, w, count = 0;
+
+ w = width;
+ {
+ int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int16x8_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r345, r456, r567, r78, r678;
+ int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
+ int32x4_t r2, r3, r5, r6, r7, r8;
+ int16x8_t q678, q78;
+
+ do {
+ dst1_ptr = dst1 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+ h = height;
+
+ load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+ src_ptr += 4 * src_stride;
+
+ q23 = vaddq_s16(s2, s3);
+ q234 = vaddq_s16(q23, s4);
+ q34 = vaddq_s16(s3, s4);
+ dst1_ptr += (dst_stride << 1);
+
+ r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2));
+ r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
+ r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_low = vaddq_s32(r23, r4_low);
+ r34_low = vaddq_s32(r3, r4_low);
+
+ r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2));
+ r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
+ r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_high = vaddq_s32(r23, r4_high);
+ r34_high = vaddq_s32(r3, r4_high);
+
+ dst2_ptr += (dst_stride << 1);
+
+ do {
+ load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+ src_ptr += 4 * src_stride;
+
+ q345 = vaddq_s16(s5, q34);
+ q56 = vaddq_s16(s5, s6);
+ q456 = vaddq_s16(s4, q56);
+ q567 = vaddq_s16(s7, q56);
+ q78 = vaddq_s16(s7, s8);
+ q678 = vaddq_s16(s6, q78);
+
+ store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += (dst_stride << 2);
+
+ s4 = s8;
+ q34 = q78;
+ q234 = q678;
+
+ r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));
+ r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
+ r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
+ r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_low);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_low, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);
+
+ r4_low = r8;
+ r34_low = r78;
+ r234_low = r678;
+
+ r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));
+ r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
+ r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
+ r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_high);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_high, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567);
+ dst2_ptr += (dst_stride << 2);
+
+ r4_high = r8;
+ r34_high = r78;
+ r234_high = r678;
+
+ h -= 4;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ {
+ int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int16x4_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
+ int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
+ int16x4_t q678, q78;
+
+ int32_t *src2_ptr;
+ uint16_t *src1_ptr;
+ count = 0;
+ h = height;
+ w = width;
+ do {
+ dst1_ptr = dst1 + (count << 2) * dst_stride;
+ dst2_ptr = dst2 + (count << 2) * dst_stride;
+ src1_ptr = dst1 + (count << 2) * dst_stride;
+ src2_ptr = dst2 + (count << 2) * dst_stride;
+ w = width;
+
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
+ transpose_s16_4x4d(&d1, &d2, &d3, &d4);
+ load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
+ transpose_s32_4x4(&r1, &r2, &r3, &r4);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q23 = vadd_s16(d2, d3);
+ q234 = vadd_s16(q23, d4);
+ q34 = vadd_s16(d3, d4);
+ dst1_ptr += 2;
+ r23 = vaddq_s32(r2, r3);
+ r234 = vaddq_s32(r23, r4);
+ r34 = vaddq_s32(r3, r4);
+ dst2_ptr += 2;
+
+ do {
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
+ transpose_s16_4x4d(&d5, &d6, &d7, &d8);
+ load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
+ transpose_s32_4x4(&r5, &r6, &r7, &r8);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q345 = vadd_s16(d5, q34);
+ q56 = vadd_s16(d5, d6);
+ q456 = vadd_s16(d4, q56);
+ q567 = vadd_s16(d7, q56);
+ q78 = vadd_s16(d7, d8);
+ q678 = vadd_s16(d6, q78);
+ transpose_s16_4x4d(&q234, &q345, &q456, &q567);
+ store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += 4;
+
+ d4 = d8;
+ q34 = q78;
+ q234 = q678;
+
+ r345 = vaddq_s32(r5, r34);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ transpose_s32_4x4(&r234, &r345, &r456, &r567);
+ store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
+ dst2_ptr += 4;
+
+ r4 = r8;
+ r34 = r78;
+ r234 = r678;
+ w -= 4;
+ } while (w > 0);
+ h -= 4;
+ count++;
+ } while (h > 0);
+ }
+}
+
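+// 3x3 weighted cross sum of 32-bit values: the centre cross (x, left, right,
+// top, bottom) is weighted by 4 and the four corners by 3, so the weights
+// total 32 (matching the NB_EVEN shift in the final filter).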
+static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {
+ int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl;
+ int32x4_t fours, threes, res;
+
+ xtl = vld1q_s32(buf - buf_stride - 1);
+ xt = vld1q_s32(buf - buf_stride);
+ xtr = vld1q_s32(buf - buf_stride + 1);
+ xl = vld1q_s32(buf - 1);
+ x = vld1q_s32(buf);
+ xr = vld1q_s32(buf + 1);
+ xbl = vld1q_s32(buf + buf_stride - 1);
+ xb = vld1q_s32(buf + buf_stride);
+ xbr = vld1q_s32(buf + buf_stride + 1);
+
+ fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x))));
+ threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl)));
+ res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes);
+ return res;
+}
+
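+// Same 4/3-weighted 3x3 cross sum as cross_sum_inp_s32(), but on eight
+// 16-bit values, widened into two 32-bit result vectors.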
+static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride,
+ int32x4_t *a0, int32x4_t *a1) {
+ uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl;
+ uint16x8_t r0, r1;
+
+ xtl = vld1q_u16(buf - buf_stride - 1);
+ xt = vld1q_u16(buf - buf_stride);
+ xtr = vld1q_u16(buf - buf_stride + 1);
+ xl = vld1q_u16(buf - 1);
+ x = vld1q_u16(buf);
+ xr = vld1q_u16(buf + 1);
+ xbl = vld1q_u16(buf + buf_stride - 1);
+ xb = vld1q_u16(buf + buf_stride);
+ xbr = vld1q_u16(buf + buf_stride + 1);
+
+ xb = vaddq_u16(xb, x);
+ xt = vaddq_u16(xt, xr);
+ xl = vaddq_u16(xl, xb);
+ xl = vaddq_u16(xl, xt);
+
+ r0 = vshlq_n_u16(xl, 2);
+
+ xbl = vaddq_u16(xbl, xbr);
+ xtl = vaddq_u16(xtl, xtr);
+ xtl = vaddq_u16(xtl, xbl);
+
+ r1 = vshlq_n_u16(xtl, 2);
+ r1 = vsubq_u16(r1, xtl);
+
+ *a0 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1))));
+ *a1 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1))));
+}
+
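+// Fast-path cross sum for even rows: the four corners are weighted by 5 and
+// the top/bottom neighbours by 6, so the weights again total 32. The *_inp16
+// variant below applies the same weights to 16-bit input.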
+static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
+ int32x4_t xtr, xt, xtl, xbr, xb, xbl;
+ int32x4_t fives, sixes, fives_plus_sixes;
+
+ xtl = vld1q_s32(buf - buf_stride - 1);
+ xt = vld1q_s32(buf - buf_stride);
+ xtr = vld1q_s32(buf - buf_stride + 1);
+ xbl = vld1q_s32(buf + buf_stride - 1);
+ xb = vld1q_s32(buf + buf_stride);
+ xbr = vld1q_s32(buf + buf_stride + 1);
+
+ fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl)));
+ sixes = vaddq_s32(xt, xb);
+ fives_plus_sixes = vaddq_s32(fives, sixes);
+
+ return vaddq_s32(
+ vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes);
+}
+
+static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
+ int32x4_t *a0, int32x4_t *a1) {
+ uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0;
+
+ xtl = vld1q_u16(buf - buf_stride - 1);
+ xt = vld1q_u16(buf - buf_stride);
+ xtr = vld1q_u16(buf - buf_stride + 1);
+ xbl = vld1q_u16(buf + buf_stride - 1);
+ xb = vld1q_u16(buf + buf_stride);
+ xbr = vld1q_u16(buf + buf_stride + 1);
+
+ xbr = vaddq_u16(xbr, xbl);
+ xtr = vaddq_u16(xtr, xtl);
+ xbr = vaddq_u16(xbr, xtr);
+ xtl = vshlq_n_u16(xbr, 2);
+ xbr = vaddq_u16(xtl, xbr);
+
+ xb = vaddq_u16(xb, xt);
+ xb0 = vshlq_n_u16(xb, 1);
+ xb = vshlq_n_u16(xb, 2);
+ xb = vaddq_u16(xb, xb0);
+
+ *a0 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb))));
+ *a1 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb))));
+}
+
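+// Fast-path cross sum for odd rows: the left and right neighbours are
+// weighted by 5 and the centre by 6, so the weights total 16 (matching the
+// NB_ODD shift). The *_inp16 variant below applies the same weights to
+// 16-bit input.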
+static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
+ int32x4_t xl, x, xr;
+ int32x4_t fives, sixes, fives_plus_sixes;
+
+ xl = vld1q_s32(buf - 1);
+ x = vld1q_s32(buf);
+ xr = vld1q_s32(buf + 1);
+ fives = vaddq_s32(xl, xr);
+ sixes = x;
+ fives_plus_sixes = vaddq_s32(fives, sixes);
+
+ return vaddq_s32(
+ vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes);
+}
+
+static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
+ int32x4_t *a1) {
+ uint16x8_t xl, x, xr;
+ uint16x8_t x0;
+
+ xl = vld1q_u16(buf - 1);
+ x = vld1q_u16(buf);
+ xr = vld1q_u16(buf + 1);
+ xl = vaddq_u16(xl, xr);
+ x0 = vshlq_n_u16(xl, 2);
+ xl = vaddq_u16(xl, x0);
+
+ x0 = vshlq_n_u16(x, 1);
+ x = vshlq_n_u16(x, 2);
+ x = vaddq_u16(x, x0);
+
+ *a0 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x))));
+ *a1 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x))));
+}
+
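+// Final filter for the fast (r == 2) path. For each pixel,
+//   dst = round((cross_sum(A) * src + cross_sum(B))
+//               >> (SGRPROJ_SGR_BITS + NB - SGRPROJ_RST_BITS))
+// with the even-row cross sums (NB_EVEN) on even rows and the odd-row cross
+// sums (NB_ODD) on odd rows.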
+void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride,
+ int16_t *src, const int src_stride,
+ int32_t *dst, const int dst_stride,
+ const int width, const int height) {
+ int16x8_t s0;
+ int32_t *B_tmp, *dst_ptr;
+ uint16_t *A_tmp;
+ int16_t *src_ptr;
+ int32x4_t a_res0, a_res1, b_res0, b_res1;
+ int w, h, count = 0;
+ assert(SGRPROJ_SGR_BITS == 8);
+ assert(SGRPROJ_RST_BITS == 4);
+
+ A_tmp = A;
+ B_tmp = B;
+ src_ptr = src;
+ dst_ptr = dst;
+ h = height;
+ do {
+ A_tmp = (A + count * buf_stride);
+ B_tmp = (B + count * buf_stride);
+ src_ptr = (src + count * src_stride);
+ dst_ptr = (dst + count * dst_stride);
+ w = width;
+ if (!(count & 1)) {
+ do {
+ s0 = vld1q_s16(src_ptr);
+ cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1);
+ a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+ a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+
+ b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride);
+ b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride);
+ a_res0 = vaddq_s32(a_res0, b_res0);
+ a_res1 = vaddq_s32(a_res1, b_res1);
+
+ a_res0 =
+ vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+ a_res1 =
+ vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+
+ vst1q_s32(dst_ptr, a_res0);
+ vst1q_s32(dst_ptr + 4, a_res1);
+
+ A_tmp += 8;
+ B_tmp += 8;
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ } else {
+ do {
+ s0 = vld1q_s16(src_ptr);
+ cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1);
+ a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+ a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+
+ b_res0 = cross_sum_fast_odd_row(B_tmp);
+ b_res1 = cross_sum_fast_odd_row(B_tmp + 4);
+ a_res0 = vaddq_s32(a_res0, b_res0);
+ a_res1 = vaddq_s32(a_res1, b_res1);
+
+ a_res0 =
+ vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+ a_res1 =
+ vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+
+ vst1q_s32(dst_ptr, a_res0);
+ vst1q_s32(dst_ptr + 4, a_res1);
+
+ A_tmp += 8;
+ B_tmp += 8;
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ count++;
+ h -= 1;
+ } while (h > 0);
+}
+
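+// Final filter for the r == 1 path: same combination as above, but every row
+// uses the 4/3-weighted 3x3 cross sums and the NB_EVEN shift.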
+void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
+ int16_t *src, const int src_stride, int32_t *dst,
+ const int dst_stride, const int width,
+ const int height) {
+ int16x8_t s0;
+ int32_t *B_tmp, *dst_ptr;
+ uint16_t *A_tmp;
+ int16_t *src_ptr;
+ int32x4_t a_res0, a_res1, b_res0, b_res1;
+ int w, h, count = 0;
+
+ assert(SGRPROJ_SGR_BITS == 8);
+ assert(SGRPROJ_RST_BITS == 4);
+ h = height;
+
+ do {
+ A_tmp = (A + count * buf_stride);
+ B_tmp = (B + count * buf_stride);
+ src_ptr = (src + count * src_stride);
+ dst_ptr = (dst + count * dst_stride);
+ w = width;
+ do {
+ s0 = vld1q_s16(src_ptr);
+ cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1);
+ a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+ a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+
+ b_res0 = cross_sum_inp_s32(B_tmp, buf_stride);
+ b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride);
+ a_res0 = vaddq_s32(a_res0, b_res0);
+ a_res1 = vaddq_s32(a_res1, b_res1);
+
+ a_res0 =
+ vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+ a_res1 =
+ vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+ vst1q_s32(dst_ptr, a_res0);
+ vst1q_s32(dst_ptr + 4, a_res1);
+
+ A_tmp += 8;
+ B_tmp += 8;
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ count++;
+ h -= 1;
+ } while (h > 0);
+}
+
+static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
+ int height, int dgd_stride,
+ int32_t *dst, int dst_stride,
+ int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ const int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *square_sum_buf = A_;
+ int32_t *sum_buf = B_;
+ uint16_t *tmp16_buf = A16_;
+
+ assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+ assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+ "Need SGRPROJ_BORDER_* >= r+1");
+
+ assert(radius_idx == 0);
+ assert(r == 2);
+
+  // The input (dgd16) is 16-bit. The first-stage sum-of-pixels output is
+  // stored in a 16-bit buffer (tmp16_buf) and the final sum in a 32-bit
+  // buffer (sum_buf). The sum-of-squares output is kept in a 32-bit buffer
+  // (square_sum_buf).
+ boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+ SGRPROJ_BORDER_HORZ),
+ dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride,
+ width_ext, height_ext);
+
+ square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+  // Calculate a and b. The a output is stored in the 16-bit tmp16_buf and
+  // lies in the range [1, 256] for all bit depths; the b output is kept in a
+  // 32-bit buffer.
+
+ if (8 == bit_depth) {
+ calc_ab_fast_internal_lbd(
+ (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
+ params->s[radius_idx], 2);
+ } else {
+ calc_ab_fast_internal_hbd(
+ (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
+ bit_depth, r, params->s[radius_idx], 2);
+ }
+ final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
+ dgd_stride, dst, dst_stride, width, height);
+}
+
+static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
+ int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+ uint16_t B16_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *square_sum_buf = A_;
+ uint16_t *sum_buf = B16_;
+ uint16_t *A16 = A16_;
+ int32_t *B = B_;
+
+ assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+ assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+ "Need SGRPROJ_BORDER_* >= r+1");
+
+ assert(radius_idx == 1);
+ assert(r == 1);
+
+  // The input (dgd16) is 16-bit. The sum-of-pixels output is stored in a
+  // 16-bit buffer (sum_buf) and the sum-of-squares output in a 32-bit buffer
+  // (square_sum_buf).
+ boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+ SGRPROJ_BORDER_HORZ),
+ dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext,
+ height_ext);
+
+ square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+  // Calculate a and b. The a output is stored in the 16-bit A16 buffer and
+  // lies in the range [1, 256] for all bit depths; the b output is kept in a
+  // 32-bit buffer.
+ if (8 == bit_depth) {
+ calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+ (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+ (B - buf_stride - 1), buf_stride, width + 2,
+ height + 2, r, params->s[radius_idx], 1);
+ } else {
+ calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
+ (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+ (B - buf_stride - 1), buf_stride, width + 2,
+ height + 2, bit_depth, r, params->s[radius_idx], 1);
+ }
+ final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst,
+ dst_stride, width, height);
+}
+
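+// Widens the 8-bit source into the 16-bit working buffer, eight pixels at a
+// time, with a scalar tail for the remaining columns and rows.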
+static INLINE void src_convert_u8_to_u16(const uint8_t *src,
+ const int src_stride, uint16_t *dst,
+ const int dst_stride, const int width,
+ const int height) {
+ const uint8_t *src_ptr;
+ uint16_t *dst_ptr;
+ int h, w, count = 0;
+
+ uint8x8_t t1, t2, t3, t4;
+ uint16x8_t s1, s2, s3, s4;
+ h = height;
+ do {
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+ w = width;
+ if (w >= 7) {
+ do {
+ load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+ s1 = vmovl_u8(t1);
+ s2 = vmovl_u8(t2);
+ s3 = vmovl_u8(t3);
+ s4 = vmovl_u8(t4);
+ store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 7);
+ }
+
+ for (int y = 0; y < w; y++) {
+ dst_ptr[y] = src_ptr[y];
+ dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+ dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+ dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+ }
+ count++;
+ h -= 4;
+ } while (h > 3);
+
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+ for (int x = 0; x < h; x++) {
+ for (int y = 0; y < width; y++) {
+ dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
+ }
+ }
+}
+
+static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
+ uint16_t *dst, const int dst_stride,
+ int width, int height) {
+ const uint16_t *src_ptr;
+ uint16_t *dst_ptr;
+ int h, w, count = 0;
+ uint16x8_t s1, s2, s3, s4;
+
+ h = height;
+ do {
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+ w = width;
+ do {
+ load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+ store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 7);
+
+ for (int y = 0; y < w; y++) {
+ dst_ptr[y] = src_ptr[y];
+ dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+ dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+ dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+ }
+ count++;
+ h -= 4;
+ } while (h > 3);
+
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+
+ for (int x = 0; x < h; x++) {
+ memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
+ sizeof(uint16_t) * width);
+ }
+}
+
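+// NEON entry point for self-guided restoration: converts the source to
+// 16 bits, then runs the fast (r == 2) filter into flt0 and/or the r == 1
+// filter into flt1, as selected by the sgr parameters.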
+void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+ int stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ uint16_t *dgd16 =
+ dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int dgd_stride = stride;
+
+ if (highbd) {
+ const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+ src_convert_hbd_copy(
+ dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ } else {
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ }
+
+ if (params->r[0] > 0)
+ restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
+ flt_stride, bit_depth, sgr_params_idx, 0);
+ if (params->r[1] > 0)
+ restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
+ bit_depth, sgr_params_idx, 1);
+}
+
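+// Applies the self-guided filter and projects the filtered planes back onto
+// the source. With u = src << SGRPROJ_RST_BITS, each output pixel is
+//   clip(round(((u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u)
+//                + xq[1] * (flt1 - u))
+//              >> (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)))
+// where a term is skipped when the corresponding radius is zero.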
+void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ uint16_t *dgd16 =
+ dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int dgd_stride = stride;
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ if (highbd) {
+ const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+ src_convert_hbd_copy(
+ dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ } else {
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ }
+
+ if (params->r[0] > 0)
+ restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+ bit_depth, eps, 0);
+ if (params->r[1] > 0)
+ restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+ bit_depth, eps, 1);
+
+ decode_xq(xqd, xq, params);
+
+ {
+ int16_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint16_t *dst16_ptr;
+ int16x4_t d0, d4;
+ int16x8_t r0, s0;
+ uint16x8_t r4;
+ int32x4_t u0, u4, v0, v4, f00, f10;
+ uint8x8_t t0;
+ int count = 0, w = width, h = height, rc = 0;
+
+ const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+ const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+ const int16x8_t zero = vdupq_n_s16(0);
+ const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+ dst_ptr = dst8;
+ src_ptr = (int16_t *)dgd16;
+ do {
+ w = width;
+ count = 0;
+ dst_ptr = dst8 + rc * dst_stride;
+ dst16_ptr = dst16 + rc * dst_stride;
+ do {
+ s0 = vld1q_s16(src_ptr + count);
+
+ u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
+ u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+ v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
+ v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ f00 = vld1q_s32(flt0 + count);
+ f10 = vld1q_s32(flt0 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq0_vec, f00);
+ v4 = vmlaq_s32(v4, xq0_vec, f10);
+ }
+
+ if (params->r[1] > 0) {
+ f00 = vld1q_s32(flt1 + count);
+ f10 = vld1q_s32(flt1 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq1_vec, f00);
+ v4 = vmlaq_s32(v4, xq1_vec, f10);
+ }
+
+ d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ r0 = vcombine_s16(d0, d4);
+
+ r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+
+ if (highbd) {
+ r4 = vminq_u16(r4, max);
+ vst1q_u16(dst16_ptr, r4);
+ } else {
+ t0 = vqmovn_u16(r4);
+ vst1_u8(dst_ptr, t0);
+ }
+ w -= 8;
+ count += 8;
+ dst_ptr += 8;
+ dst16_ptr += 8;
+ } while (w > 0);
+
+ src_ptr += dgd16_stride;
+ flt1 += width;
+ flt0 += width;
+ rc++;
+ h--;
+ } while (h > 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
index 53727bb43..fe134087b 100644
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -419,4 +419,42 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
*a3 = vreinterpret_s16_s32(c1.val[1]);
}
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+ return b0;
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2, int32x4_t *a3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *a0 = c0.val[0];
+ *a1 = c1.val[0];
+ *a2 = c0.val[1];
+ *a3 = c1.val[1];
+}
+
#endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 738290fad..9d68b8760 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -1308,7 +1308,7 @@ static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
end <<= MI_SIZE_LOG2;
uint8_t *ref0 = ref_buf;
uint8_t *dst0 = dst_buf;
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
for (int j = 0; j < 4; ++j) {
@@ -1404,11 +1404,11 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
- if (cm->use_highbitdepth)
+ if (cm->seq_params.use_highbitdepth)
highbd_filter_selectively_vert_row2(
ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
- &cm->lf_info, lfl, lfl2, (int)cm->bit_depth);
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
else
filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
mask_16x16_0, mask_8x8_0, mask_4x4_0,
@@ -1474,10 +1474,11 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
- if (cm->use_highbitdepth)
- highbd_filter_selectively_horiz(
- CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
- mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->bit_depth);
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl,
+ (int)cm->seq_params.bit_depth);
else
filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
mask_8x8, mask_4x4, &cm->lf_info, lfl);
@@ -1652,6 +1653,8 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
const int dst_stride = plane_ptr->dst.stride;
const int y_range = (MAX_MIB_SIZE >> scale_vert);
const int x_range = (MAX_MIB_SIZE >> scale_horz);
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
for (int y = 0; y < y_range; y += row_step) {
uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
for (int x = 0; x < x_range;) {
@@ -1677,40 +1680,40 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
case 6: // apply 6-tap filter for chroma plane only
assert(plane != 0);
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 8-tap filtering
case 8:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 14-tap filtering
case 14:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
- cm->bit_depth);
+ bit_depth);
else
aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
@@ -1737,6 +1740,8 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
const int dst_stride = plane_ptr->dst.stride;
const int y_range = (MAX_MIB_SIZE >> scale_vert);
const int x_range = (MAX_MIB_SIZE >> scale_horz);
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
for (int x = 0; x < x_range; x += col_step) {
uint8_t *p = dst_ptr + x * MI_SIZE;
for (int y = 0; y < y_range;) {
@@ -1762,10 +1767,10 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
@@ -1773,30 +1778,30 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
// apply 6-tap filtering
case 6:
assert(plane != 0);
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 8-tap filtering
case 8:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
// apply 14-tap filtering
case 14:
- if (cm->use_highbitdepth)
+ if (use_highbitdepth)
aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
- params.hev_thr, cm->bit_depth);
+ params.hev_thr, bit_depth);
else
aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
index 38e26bee1..a77a4d254 100644
--- a/third_party/aom/av1/common/av1_rtcd.c
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -16,7 +16,7 @@
#include "aom_ports/aom_once.h"
void av1_rtcd() {
- // TODO(JBB): Remove this once, by insuring that both the encoder and
- // decoder setup functions are protected by once();
- once(setup_rtcd_internal);
+  // TODO(JBB): Remove this aom_once, by ensuring that both the encoder and
+  // decoder setup functions are protected by aom_once();
+ aom_once(setup_rtcd_internal);
}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index 6aa925515..fa8b34981 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -106,7 +106,7 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
#inv txfm
add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_inv_txfm_add ssse3 avx2/;
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -181,7 +181,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#fwd txfm
add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
- specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1/;
+ specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -241,11 +241,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_txb_init_levels sse4_1/;
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
- specialize qw/av1_wedge_sse_from_residuals sse2/;
+ specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
- specialize qw/av1_wedge_sign_from_residuals sse2/;
+ specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
- specialize qw/av1_wedge_compute_delta_squares sse2/;
+ specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
# hash
add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
@@ -288,34 +288,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# LOOP_RESTORATION functions
add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
-specialize qw/apply_selfguided_restoration sse4_1 avx2/;
+specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
int sgr_params_idx, int bit_depth, int highbd";
-specialize qw/av1_selfguided_restoration sse4_1 avx2/;
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
# CONVOLVE_ROUND/COMPOUND_ROUND functions
-add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-
- add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
- add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+
+ add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+ add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
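Each specialize line above feeds the rtcd generator, which emits one function pointer per prototype and points it at the best variant the target CPU supports; adding neon is what routes ARM builds to the new av1_selfguided_restoration_neon kernel. Below is a rough, illustrative sketch of the runtime setup the generator produces, with the RTCD_EXTERN/setup_rtcd_internal() plumbing omitted (the exact emitted code differs):

    /* Sketch only: shape of the generated dispatch for the new neon variant. */
    av1_selfguided_restoration = av1_selfguided_restoration_c;
    #if HAVE_NEON
      if (flags & HAS_NEON)
        av1_selfguided_restoration = av1_selfguided_restoration_neon;
    #endif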
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index 5db3233f5..c9cc79852 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -171,53 +171,6 @@ static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
}
-static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
- switch (tx_size) {
- case TX_4X4: return TX_4X4;
- case TX_8X8: return TX_8X8;
- case TX_16X16: return TX_16X16;
- case TX_32X32: return TX_32X32;
- case TX_64X64: return TX_64X64;
- case TX_32X64: return TX_64X32;
- case TX_64X32: return TX_32X64;
- case TX_4X8: return TX_8X4;
- case TX_8X4: return TX_4X8;
- case TX_8X16: return TX_16X8;
- case TX_16X8: return TX_8X16;
- case TX_16X32: return TX_32X16;
- case TX_32X16: return TX_16X32;
- case TX_4X16: return TX_16X4;
- case TX_16X4: return TX_4X16;
- case TX_8X32: return TX_32X8;
- case TX_32X8: return TX_8X32;
- case TX_16X64: return TX_64X16;
- case TX_64X16: return TX_16X64;
- default: assert(0); return TX_INVALID;
- }
-}
-
-static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) {
- switch (tx_type) {
- case DCT_DCT: return DCT_DCT;
- case ADST_DCT: return DCT_ADST;
- case DCT_ADST: return ADST_DCT;
- case ADST_ADST: return ADST_ADST;
- case FLIPADST_DCT: return DCT_FLIPADST;
- case DCT_FLIPADST: return FLIPADST_DCT;
- case FLIPADST_FLIPADST: return FLIPADST_FLIPADST;
- case ADST_FLIPADST: return FLIPADST_ADST;
- case FLIPADST_ADST: return ADST_FLIPADST;
- case IDTX: return IDTX;
- case V_DCT: return H_DCT;
- case H_DCT: return V_DCT;
- case V_ADST: return H_ADST;
- case H_ADST: return V_ADST;
- case V_FLIPADST: return H_FLIPADST;
- case H_FLIPADST: return V_FLIPADST;
- default: assert(0); return TX_TYPES;
- }
-}
-
// Utility function that returns the log of the ratio of the col and row
// sizes.
static INLINE int get_rect_tx_log_ratio(int col, int row) {
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 3e8d1d6c6..979f13bd9 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -605,6 +605,12 @@ static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
}
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+ return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? CONVERT_TO_BYTEPTR(buf16)
+ : buf16;
+}
+
static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_4X4: return 0;
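The get_buf_by_bd() helper added above lets shared prediction code hand one scratch buffer to both bit-depth paths. A minimal sketch of the intended usage, assuming a caller that owns a 16-bit scratch buffer; the wrapper name and buffer are illustrative, not taken from the patch:

    /* Sketch: build a destination pointer usable in either bit-depth path. */
    static uint8_t *scratch_pred_ptr(const MACROBLOCKD *xd, uint16_t *tmp16) {
      /* High bit depth: returns CONVERT_TO_BYTEPTR(tmp16), so callees that
       * apply CONVERT_TO_SHORTPTR() land back on tmp16; otherwise the same
       * storage is reused directly as 8-bit samples. */
      return get_buf_by_bd(xd, (uint8_t *)tmp16);
    }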
@@ -674,6 +680,15 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
};
+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+ 0x0001, // 0000 0000 0000 0001
+ 0x0201, // 0000 0010 0000 0001
+ 0x020F, // 0000 0010 0000 1111
+ 0x0E0F, // 0000 1110 0000 1111
+ 0x0FFF, // 0000 1111 1111 1111
+ 0xFFFF, // 1111 1111 1111 1111
+};
+
static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
int use_reduced_set) {
const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
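The new av1_ext_tx_used_flag table packs each row of av1_ext_tx_used into a 16-bit mask, with bit i set when transform type i (in TX_TYPE enum order, DCT_DCT = 0 through H_FLIPADST = 15) is allowed for that set. A small self-contained check of the encoding for the 0x0E0F entry; the expanded row below is reconstructed from the mask for illustration, not quoted from the table:

    #include <assert.h>
    #include <stdint.h>

    static int tx_type_allowed(uint16_t used_flag, int tx_type) {
      return (used_flag >> tx_type) & 1;  /* bit i mirrors table column i */
    }

    int main(void) {
      /* 0x0E0F <=> { DCT_DCT..ADST_ADST, IDTX, V_DCT, H_DCT } allowed. */
      const int row[16] = { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 };
      for (int i = 0; i < 16; ++i)
        assert(tx_type_allowed(0x0E0F, i) == row[i]);
      return 0;
    }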
@@ -1145,38 +1160,6 @@ static INLINE PLANE_TYPE get_plane_type(int plane) {
return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
}
-static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
- const uint8_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_uint16(uint16_t *dst, int dst_stride,
- const uint16_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int16(int16_t *dst, int dst_stride,
- const int16_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
-static INLINE void transpose_int32(int32_t *dst, int dst_stride,
- const int32_t *src, int src_stride, int w,
- int h) {
- int r, c;
- for (r = 0; r < h; ++r)
- for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
-}
-
static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
return 1024;
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
index c9b974900..e9e2b0e42 100644
--- a/third_party/aom/av1/common/cdef.c
+++ b/third_party/aom/av1/common/cdef.c
@@ -110,7 +110,7 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
const uint8_t *src, int src_voffset, int src_hoffset,
int sstride, int vsize, int hsize) {
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
const uint16_t *base =
&CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
@@ -153,7 +153,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int mi_high_l2[3];
int xdec[3];
int ydec[3];
- int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+ int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
@@ -363,7 +363,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
}
- if (cm->use_highbitdepth) {
+ if (cm->seq_params.use_highbitdepth) {
cdef_filter_fb(
NULL,
&CONVERT_TO_SHORTPTR(
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
index ee19f0bcf..ccc59b4eb 100644
--- a/third_party/aom/av1/common/cfl.c
+++ b/third_party/aom/av1/common/cfl.c
@@ -15,21 +15,14 @@
#include "config/av1_rtcd.h"
-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
- if (!(cm->subsampling_x == 0 && cm->subsampling_y == 0) &&
- !(cm->subsampling_x == 1 && cm->subsampling_y == 1) &&
- !(cm->subsampling_x == 1 && cm->subsampling_y == 0)) {
- aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
- "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported by "
- "CfL, %d %d subsampling is not supported.\n",
- cm->subsampling_x, cm->subsampling_y);
- }
+
memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
- cfl->subsampling_x = cm->subsampling_x;
- cfl->subsampling_y = cm->subsampling_y;
+ cfl->subsampling_x = seq_params->subsampling_x;
+ cfl->subsampling_y = seq_params->subsampling_y;
cfl->are_parameters_computed = 0;
cfl->store_y = 0;
// The DC_PRED cache is disabled by default and is only enabled in
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index d57f44f8b..ed962c722 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -75,8 +75,8 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -91,7 +91,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -107,7 +107,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -126,8 +126,8 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -141,7 +141,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -156,8 +156,8 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -172,7 +172,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -187,8 +187,8 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
@@ -204,8 +204,8 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -222,7 +222,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -238,7 +238,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -270,8 +270,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -289,7 +289,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -320,8 +320,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -339,7 +339,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -370,8 +370,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -412,8 +412,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
@@ -439,7 +439,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(x_filter_idx < SUBPEL_SHIFTS);
const int16_t *x_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
sum += x_filter[k] * src_x[k - fo_horiz];
@@ -461,7 +461,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(y_filter_idx < SUBPEL_SHIFTS);
const int16_t *y_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -498,8 +498,8 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
static void convolve_2d_scale_wrapper(
const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
if (conv_params->is_compound) {
@@ -520,25 +520,27 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
(void)y_step_q4;
(void)dst;
(void)dst_stride;
-
- InterpFilterParams filter_params_x, filter_params_y;
- av1_get_convolve_filter_params(interp_filters, &filter_params_x,
- &filter_params_y, w, h);
+ InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+ InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(filter_x, w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(filter_y, h);
if (scaled)
convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
- &filter_params_x, &filter_params_y, subpel_x_q4,
+ filter_params_x, filter_params_y, subpel_x_q4,
x_step_q4, subpel_y_q4, y_step_q4, conv_params);
else
sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
- src, src_stride, dst, dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+ src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
}
void av1_highbd_convolve_2d_copy_sr_c(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
@@ -554,8 +556,8 @@ void av1_highbd_convolve_2d_copy_sr_c(
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -569,7 +571,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -585,8 +587,8 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -599,7 +601,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -614,8 +616,8 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -630,7 +632,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -646,7 +648,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -666,8 +668,9 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
uint16_t *dst16, int dst16_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
int x, y, k;
@@ -685,7 +688,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (y = 0; y < im_h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -703,7 +706,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
int16_t *src_vert = im_block + fo_vert * im_stride;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = 1 << offset_bits;
@@ -734,8 +737,9 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
uint16_t *dst16, int dst16_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -753,7 +757,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
assert(bits >= 0);
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -784,8 +788,9 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
uint16_t *dst16, int dst16_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -803,7 +808,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
assert(bits >= 0);
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -834,8 +839,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
void av1_highbd_jnt_convolve_2d_copy_c(
const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
- int w, int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int w, int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -875,8 +880,8 @@ void av1_highbd_jnt_convolve_2d_copy_c(
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params, int bd) {
@@ -900,7 +905,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(x_filter_idx < SUBPEL_SHIFTS);
const int16_t *x_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
sum += x_filter[k] * src_x[k - fo_horiz];
@@ -922,7 +927,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(y_filter_idx < SUBPEL_SHIFTS);
const int16_t *y_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -971,9 +976,12 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
(void)dst_stride;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- InterpFilterParams filter_params_x, filter_params_y;
- av1_get_convolve_filter_params(interp_filters, &filter_params_x,
- &filter_params_y, w, h);
+ InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+ InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(filter_x, w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(filter_y, h);
if (scaled) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -981,16 +989,16 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
assert(conv_params->dst != NULL);
}
av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
- &filter_params_x, &filter_params_y,
- subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
- conv_params, bd);
+ filter_params_x, filter_params_y, subpel_x_q4,
+ x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+ bd);
} else {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
0][conv_params->is_compound](
- src, src_stride, dst, dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+ src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
}
}
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index 1b2c2d0d5..bc2d4bccf 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -40,27 +40,17 @@ typedef struct ConvolveParams {
typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);
typedef void (*aom_highbd_convolve_fn_t)(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
- InterpFilterParams *params_x,
- InterpFilterParams *params_y,
- int w, int h) {
- InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
- InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
- *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
- *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
-}
-
struct AV1Common;
struct scale_factors;
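With aom_convolve_fn_t now taking const InterpFilterParams pointers, the per-frame dispatch tables (e.g. sf->convolve[][][]) and the C fallbacks share one signature. A small sketch of storing and invoking a variant through the typedef; the wrapper name is assumed and all buffers come from the caller:

    /* Sketch: route a call through the function-pointer type. */
    static void run_convolve_2d(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *fx,
                                const InterpFilterParams *fy, int subpel_x_q4,
                                int subpel_y_q4, ConvolveParams *conv_params) {
      aom_convolve_fn_t fn = av1_convolve_2d_sr_c;  /* or a SIMD variant */
      fn(src, src_stride, dst, dst_stride, w, h, fx, fy, subpel_x_q4,
         subpel_y_q4, conv_params);
    }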
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index a37ee9f24..689c25f30 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -557,6 +557,7 @@ typedef uint8_t TXFM_CONTEXT;
#define BWDREF_FRAME 5
#define ALTREF2_FRAME 6
#define ALTREF_FRAME 7
+#define EXTREF_FRAME REF_FRAMES
#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
@@ -607,6 +608,7 @@ typedef enum ATTRIBUTE_PACKED {
// In large_scale_tile coding, external references are used.
#define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
#ifdef __cplusplus
} // extern "C"
diff --git a/third_party/aom/av1/common/filter.c b/third_party/aom/av1/common/filter.c
deleted file mode 100644
index a7e67ea4a..000000000
--- a/third_party/aom/av1/common/filter.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/filter.h"
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- bilinear_filters[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
- { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
- { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
- { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
- { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
- { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
- { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
- { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_8[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
- { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
- { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
- { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
- { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
- { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
- { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
- { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
- { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
- { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
- { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
- { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
- { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
- { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
- { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
- { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
- { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
- { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
- { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
- { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
- { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
- { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
-};
-
-static const InterpFilterParams
- av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
- { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_REGULAR },
- { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SMOOTH },
- { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
- MULTITAP_SHARP },
- { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
- BILINEAR }
- };
-
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_4[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
- { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
- { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
- { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
- { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
- { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
- { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
- { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
-};
-DECLARE_ALIGNED(256, static const InterpKernel,
- sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
- { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
- { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
- { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
- { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
- { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
- { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
- { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
-};
-
-static const InterpFilterParams av1_interp_4tap[2] = {
- { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_REGULAR },
- { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SMOOTH },
-};
-
-InterpFilterParams av1_get_interp_filter_params_with_block_size(
- const InterpFilter interp_filter, const int w) {
- if (w <= 4 &&
- (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR))
- return av1_interp_4tap[0];
- else if (w <= 4 && interp_filter == EIGHTTAP_SMOOTH)
- return av1_interp_4tap[1];
-
- return av1_interp_filter_params_list[interp_filter];
-}
-
-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) {
- return (const int16_t *)av1_interp_filter_params_list[interp_filter]
- .filter_ptr;
-}
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 0c24ad9d0..7f8ad583a 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -64,8 +64,8 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
}
-#define LOG_SWITCHABLE_FILTERS \
- 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2
#define MAX_SUBPEL_TAPS 12
#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
@@ -79,14 +79,116 @@ typedef struct InterpFilterParams {
InterpFilter interp_filter;
} InterpFilterParams;
-const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter);
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_bilinear_filters[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
+ { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+ { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
+ { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
+ { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
+ { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
+ { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
+ { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+ { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+ { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+ { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+ { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+ { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+ { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
+ { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
+};
+
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
+ SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR }
+ };
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
+ { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
+ { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+ { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+ { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+ { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+ { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+ { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+ { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR },
+};
+
+static INLINE const InterpFilterParams *
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+ const int w) {
+ if (w <= 4) return &av1_interp_4tap[interp_filter];
+ return &av1_interp_filter_params_list[interp_filter];
+}
-InterpFilterParams av1_get_interp_filter_params_with_block_size(
- const InterpFilter interp_filter, const int w);
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+ const InterpFilter interp_filter) {
+ return av1_interp_filter_params_list[interp_filter].filter_ptr;
+}
static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
- const InterpFilterParams filter_params, const int subpel) {
- return filter_params.filter_ptr + filter_params.taps * subpel;
+ const InterpFilterParams *const filter_params, const int subpel) {
+ return filter_params->filter_ptr + filter_params->taps * subpel;
}
#ifdef __cplusplus
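Since the kernel tables and the getters are now static inline in the header, selecting a filter is two lookups with no struct copies. A minimal sketch; the block width, filter choice, and subpel phase are arbitrary example values, and the snippet assumes "av1/common/filter.h" is included:

    static const int16_t *example_kernel(void) {
      const InterpFilterParams *params =
          av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR,
                                                       /*w=*/4);
      /* w <= 4 selects av1_interp_4tap; its kernels keep SUBPEL_TAPS entries
       * with the outer taps zeroed. */
      return av1_get_interp_filter_subpel_kernel(params, 5 & SUBPEL_MASK);
    }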
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index a6227f18f..c2495640e 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -294,9 +294,6 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
mv->row = clamp(mv->row, min_row, max_row);
}
-static INLINE int mv_has_subpel(const MV *mv) {
- return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK);
-}
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index 716b4a247..f68c159e1 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -44,7 +44,7 @@ static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
assert(b >= 0 && b < (1 << bits));
int diff = a - b;
- int m = 1 << (bits - 1);
+ const int m = 1 << (bits - 1);
diff = (diff & (m - 1)) - (diff & m);
return diff;
}
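For reference, get_relative_dist() maps the difference of two order hints into the signed range [-(1 << (bits - 1)), (1 << (bits - 1)) - 1]; the (diff & (m - 1)) - (diff & m) form performs the wrap without a branch. A self-contained check of that arithmetic, with the order-hint bit count passed explicitly (the real function reads it from the sequence header):

    #include <assert.h>

    static int relative_dist(int bits, int a, int b) {
      int diff = a - b;
      const int m = 1 << (bits - 1);
      diff = (diff & (m - 1)) - (diff & m);
      return diff;
    }

    int main(void) {
      /* With 7 order-hint bits, hints wrap at 128: 2 is 5 frames after 125. */
      assert(relative_dist(7, 2, 125) == 5);
      assert(relative_dist(7, 125, 2) == -5);
      assert(relative_dist(7, 60, 10) == 50);
      return 0;
    }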
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index fa5f02e52..6b1bf2d74 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -184,7 +184,10 @@ typedef struct BitstreamLevel {
uint8_t minor;
} BitstreamLevel;
-/* Initial version of sequence header structure */
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency is checked by are_seq_headers_consistent() function.
typedef struct SequenceHeader {
int num_bits_width;
int num_bits_height;
@@ -205,7 +208,6 @@ typedef struct SequenceHeader {
// 2 - adaptive
int still_picture; // Video is a single frame still picture
int reduced_still_picture_hdr; // Use reduced header for still picture
- int monochrome; // Monochorme video
int enable_filter_intra; // enables/disables filterintra
int enable_intra_edge_filter; // enables/disables corner/edge/upsampling
int enable_interintra_compound; // enables/disables interintra_compound
@@ -229,6 +231,9 @@ typedef struct SequenceHeader {
// enabled for that frame.
int enable_cdef; // To turn on/off CDEF
int enable_restoration; // To turn on/off loop restoration
+ BITSTREAM_PROFILE profile;
+
+ // Operating point info.
int operating_points_cnt_minus_1;
int operating_point_idc[MAX_NUM_OPERATING_POINTS];
int display_model_info_present_flag;
@@ -236,15 +241,26 @@ typedef struct SequenceHeader {
BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0
// or 1.
-} SequenceHeader;
-typedef struct AV1Common {
- struct aom_internal_error_info error;
+ // Color config.
+ aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
+ // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+ int use_highbitdepth; // If true, we need to use 16bit frame buffers.
+  int monochrome;        // Monochrome video
aom_color_primaries_t color_primaries;
aom_transfer_characteristics_t transfer_characteristics;
aom_matrix_coefficients_t matrix_coefficients;
- aom_chroma_sample_position_t chroma_sample_position;
int color_range;
+ int subsampling_x; // Chroma subsampling for x
+ int subsampling_y; // Chroma subsampling for y
+ aom_chroma_sample_position_t chroma_sample_position;
+ int separate_uv_delta_q;
+
+ int film_grain_params_present;
+} SequenceHeader;
+
+typedef struct AV1Common {
+ struct aom_internal_error_info error;
int width;
int height;
int render_width;
@@ -253,18 +269,11 @@ typedef struct AV1Common {
int last_height;
int timing_info_present;
aom_timing_info_t timing_info;
- int buffer_removal_delay_present;
+ int buffer_removal_time_present;
aom_dec_model_info_t buffer_model;
aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
- int tu_presentation_delay_flag;
- int64_t tu_presentation_delay;
-
- // TODO(jkoleszar): this implies chroma ss right now, but could vary per
- // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
- // support additional planes.
- int subsampling_x;
- int subsampling_y;
+ uint32_t frame_presentation_time;
int largest_tile_id;
size_t largest_tile_size;
@@ -273,8 +282,6 @@ typedef struct AV1Common {
// Scale of the current frame with respect to itself.
struct scale_factors sf_identity;
- // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
- int use_highbitdepth;
YV12_BUFFER_CONFIG *frame_to_show;
RefCntBuffer *prev_frame;
@@ -342,8 +349,6 @@ typedef struct AV1Common {
int u_ac_delta_q;
int v_ac_delta_q;
- int separate_uv_delta_q;
-
// The dequantizers below are true dequntizers used only in the
// dequantization process. They have the same coefficient
// shift/scale as TX.
@@ -447,10 +452,7 @@ typedef struct AV1Common {
unsigned int frame_offset;
unsigned int current_video_frame;
- BITSTREAM_PROFILE profile;
- // AOM_BITS_8 in profile 0 or 1, AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
- aom_bit_depth_t bit_depth;
aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer
int error_resilient_mode;
@@ -494,9 +496,8 @@ typedef struct AV1Common {
ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
TXFM_CONTEXT **above_txfm_context;
WarpedMotionParams global_motion[REF_FRAMES];
- aom_film_grain_table_t *film_grain_table;
- int film_grain_params_present;
aom_film_grain_t film_grain_params;
+
int cdef_pri_damping;
int cdef_sec_damping;
int nb_cdef_strengths;
@@ -590,7 +591,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
if (frame_bufs[i].ref_count == 0) break;
if (i != FRAME_BUFFERS) {
- if (frame_bufs[i].buf.use_external_refernce_buffers) {
+ if (frame_bufs[i].buf.use_external_reference_buffers) {
// If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
// external reference buffers. Restore the buffer pointers to point to the
// internally allocated memory.
@@ -598,7 +599,7 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
ybf->y_buffer = ybf->store_buf_adr[0];
ybf->u_buffer = ybf->store_buf_adr[1];
ybf->v_buffer = ybf->store_buf_adr[2];
- ybf->use_external_refernce_buffers = 0;
+ ybf->use_external_reference_buffers = 0;
}
frame_bufs[i].ref_count = 1;
@@ -683,15 +684,7 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
}
}
-static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) {
- return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
-}
-
-static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) {
- return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
-}
-
-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
static INLINE int av1_num_planes(const AV1_COMMON *cm) {
return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
@@ -734,7 +727,7 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
}
xd->mi_stride = cm->mi_stride;
xd->error_info = &cm->error;
- cfl_init(&xd->cfl, cm);
+ cfl_init(&xd->cfl, &cm->seq_params);
}
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -1066,17 +1059,18 @@ static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
}
-static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd,
int mi_col_start, int mi_col_end, const int tile_row) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
const int width = mi_col_end - mi_col_start;
const int aligned_width =
- ALIGN_POWER_OF_TWO(width, cm->seq_params.mib_size_log2);
+ ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
const int offset_y = mi_col_start;
const int width_y = aligned_width;
- const int offset_uv = offset_y >> cm->subsampling_x;
- const int width_uv = width_y >> cm->subsampling_x;
+ const int offset_uv = offset_y >> seq_params->subsampling_x;
+ const int width_uv = width_y >> seq_params->subsampling_x;
av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
if (num_planes > 1) {
@@ -1084,7 +1078,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
} else {
- aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
"Invalid value of planes");
}
}
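
For context on the hunk above: av1_zero_above_context now reads subsampling from seq_params rather than from AV1_COMMON. Below is a minimal standalone sketch of the width arithmetic it performs, assuming the standard aom ALIGN_POWER_OF_TWO macro and illustrative values (mib_size_log2 = 5, i.e. a 128x128 superblock measured in mode-info units, and 4:2:0 chroma subsampling); it is not part of the patch.

#include <stdio.h>

/* Assumed to match aom_dsp/aom_dsp_common.h; reproduced only to keep the
 * sketch self-contained. */
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int mi_col_start = 0, mi_col_end = 19; /* hypothetical tile columns */
  const int mib_size_log2 = 5;                 /* assumed: 128x128 superblock */
  const int subsampling_x = 1;                 /* assumed: 4:2:0 */
  const int width = mi_col_end - mi_col_start;
  /* The luma context width is rounded up to a whole superblock... */
  const int width_y = ALIGN_POWER_OF_TWO(width, mib_size_log2); /* 19 -> 32 */
  /* ...and the chroma width is the luma width shifted by the subsampling. */
  const int width_uv = width_y >> subsampling_x;                /* 32 -> 16 */
  printf("width_y=%d width_uv=%d\n", width_y, width_uv);
  return 0;
}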
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
index 84575d74b..0e14da7a3 100644
--- a/third_party/aom/av1/common/quant_common.c
+++ b/third_party/aom/av1/common/quant_common.c
@@ -223,29 +223,6 @@ int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
return av1_ac_quant_Q3(qindex, delta, bit_depth);
}
-int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth) {
- int i;
- const int16_t *tab = ac_qlookup_Q3;
- switch (bit_depth) {
- case AOM_BITS_10: {
- tab = ac_qlookup_10_Q3;
- break;
- }
- case AOM_BITS_12: {
- tab = ac_qlookup_12_Q3;
- break;
- }
- default:
- assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
- return -1;
- }
- (void)bit_depth;
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (ac_Q3 <= tab[i]) return i;
- }
- return QINDEX_RANGE - 1;
-}
-
int av1_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex) {
if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index f9681036d..ca199e94c 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -42,7 +42,6 @@ int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth);
int av1_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex);
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index b6ac436fb..b9f0b57f3 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -627,9 +627,7 @@ void av1_make_masked_inter_predictor(
tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
#undef INTER_PRED_BYTES_PER_PIXEL
- uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- ? CONVERT_TO_BYTEPTR(tmp_buf)
- : tmp_buf;
+ uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf);
const int tmp_buf_stride = MAX_SB_SIZE;
CONV_BUF_TYPE *org_dst = conv_params->dst;
@@ -1002,8 +1000,8 @@ void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
{ xd->plane[0].dst.stride, 0, 0 } };
if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf,
- xd->plane[0].dst.stride, ctx, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, ctx, 0, bsize);
}
}
@@ -1609,10 +1607,10 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
const int ssy = xd->plane[plane].subsampling_y;
BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
- xd->mi[0]->angle_delta[PLANE_TYPE_Y] = 0;
- xd->mi[0]->angle_delta[PLANE_TYPE_UV] = 0;
- xd->mi[0]->filter_intra_mode_info.use_filter_intra = 0;
- xd->mi[0]->use_intrabc = 0;
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
+ assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
+ assert(xd->mi[0]->use_intrabc == 0);
av1_predict_intra_block(cm, xd, pd->width, pd->height,
max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
@@ -1642,42 +1640,23 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
inter_pred, inter_stride, intra_pred, intra_stride);
}
-void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, int ystride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
- av1_build_intra_predictors_for_interintra(
- cm, xd, bsize, 0, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, 0, ypred, ystride,
- CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
- return;
- }
- {
- DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
- av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, ctx,
- intrapredictor, MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, 0, ypred, ystride, intrapredictor,
- MAX_SB_SIZE);
- }
-}
-
-void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *upred, int ustride,
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
BUFFER_SET *ctx, int plane,
BLOCK_SIZE bsize) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
av1_build_intra_predictors_for_interintra(
- cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(uintrapredictor),
+ cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, plane, upred, ustride,
- CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride,
+ CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
} else {
- DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
- uintrapredictor, MAX_SB_SIZE);
- av1_combine_interintra(xd, bsize, plane, upred, ustride, uintrapredictor,
+ intrapredictor, MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
MAX_SB_SIZE);
}
}
@@ -1686,8 +1665,8 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *upred, uint8_t *vpred,
int ustride, int vstride,
BUFFER_SET *ctx, BLOCK_SIZE bsize) {
- av1_build_interintra_predictors_sbc(cm, xd, upred, ustride, ctx, 1, bsize);
- av1_build_interintra_predictors_sbc(cm, xd, vpred, vstride, ctx, 2, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
}
void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1695,7 +1674,7 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *vpred, int ystride, int ustride,
int vstride, BUFFER_SET *ctx,
BLOCK_SIZE bsize) {
- av1_build_interintra_predictors_sby(cm, xd, ypred, ystride, ctx, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize);
av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
ctx, bsize);
}
@@ -1713,9 +1692,7 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
struct buf_2d *const pre_buf = &pd->pre[ref];
- const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
- uint8_t *const dst =
- (hbd ? CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x;
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
const MV mv = mi->mv[ref].as_mv;
ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index aa3aefc88..6a3def270 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -412,12 +412,9 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
int vstride, BUFFER_SET *ctx,
BLOCK_SIZE bsize);
-void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, int ystride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize);
-
-void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *upred, int ustride,
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
BUFFER_SET *ctx, int plane,
BLOCK_SIZE bsize);
@@ -429,6 +426,7 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
void av1_build_intra_predictors_for_interintra(
const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+
void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const uint8_t *inter_pred, int inter_stride,
const uint8_t *intra_pred, int intra_stride);
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
index 21d1f60b2..71a52e73e 100644
--- a/third_party/aom/av1/common/reconintra.c
+++ b/third_party/aom/av1/common/reconintra.c
@@ -1071,13 +1071,6 @@ static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
p_left[-1] = s;
}
-static int use_intra_edge_upsample(int bs0, int bs1, int delta, int type) {
- const int d = abs(delta);
- const int blk_wh = bs0 + bs1;
- if (d <= 0 || d >= 40) return 0;
- return type ? (blk_wh <= 8) : (blk_wh <= 16);
-}
-
void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
// interpolate half-sample positions
assert(sz <= MAX_UPSAMPLE_SZ);
@@ -1284,13 +1277,13 @@ static void build_intra_predictors_high(
}
}
upsample_above =
- use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
if (need_above && upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
}
upsample_left =
- use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
if (need_left && upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
@@ -1467,13 +1460,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
}
}
upsample_above =
- use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
if (need_above && upsample_above) {
const int n_px = txwpx + (need_right ? txhpx : 0);
av1_upsample_intra_edge(above_row, n_px);
}
upsample_left =
- use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
if (need_left && upsample_left) {
const int n_px = txhpx + (need_bottom ? txwpx : 0);
av1_upsample_intra_edge(left_col, n_px);
@@ -1642,4 +1635,6 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
dst_stride, dst, dst_stride, blk_col, blk_row, plane);
}
-void av1_init_intra_predictors(void) { once(init_intra_predictors_internal); }
+void av1_init_intra_predictors(void) {
+ aom_once(init_intra_predictors_internal);
+}
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index a7d9e8b79..57638f24e 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -12,6 +12,8 @@
#ifndef AV1_COMMON_RECONINTRA_H_
#define AV1_COMMON_RECONINTRA_H_
+#include <stdlib.h>
+
#include "aom/aom_integer.h"
#include "av1/common/blockd.h"
#include "av1/common/onyxc_int.h"
@@ -103,6 +105,14 @@ static INLINE int av1_get_dy(int angle) {
return 1;
}
}
+
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+ int type) {
+ const int d = abs(delta);
+ const int blk_wh = bs0 + bs1;
+ if (d <= 0 || d >= 40) return 0;
+ return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
#ifdef __cplusplus
} // extern "C"
#endif
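
The reconintra.h hunk above promotes the former static helper to a public inline, av1_use_intra_edge_upsample. A small standalone usage sketch follows; the body is taken from the diff, while the call values are illustrative (per the call sites in reconintra.c, bs0 and bs1 are the transform block width and height in pixels, and delta is the prediction angle's offset from the base direction).

#include <stdio.h>
#include <stdlib.h>

/* Adapted from the hunk above so the sketch compiles on its own. */
static int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, int type) {
  const int d = abs(delta);
  const int blk_wh = bs0 + bs1;
  if (d <= 0 || d >= 40) return 0;
  return type ? (blk_wh <= 8) : (blk_wh <= 16);
}

int main(void) {
  /* 4x4 block, small angle offset, edge filter type 0: upsampling is used. */
  printf("%d\n", av1_use_intra_edge_upsample(4, 4, 3, 0));
  /* 8x8 block with filter type 1: combined size 16 exceeds the limit of 8. */
  printf("%d\n", av1_use_intra_edge_upsample(8, 8, 3, 1));
  /* Exactly on the base angle (delta == 0): never upsampled. */
  printf("%d\n", av1_use_intra_edge_upsample(4, 4, 0, 0));
  return 0;
}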
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 17e6823b1..93d62292a 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -1100,7 +1100,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
int src_stride, uint8_t *dst, int dst_stride,
int plane, int rows) {
const int is_uv = (plane > 0);
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
const int upscaled_plane_width =
ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
@@ -1141,10 +1141,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
const int pad_left = (j == 0);
const int pad_right = (j == cm->tile_cols - 1);
- if (cm->use_highbitdepth)
- highbd_upscale_normative_rect(
- src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
- dst_stride, x_step_qn, x0_qn, pad_left, pad_right, cm->bit_depth);
+ if (cm->seq_params.use_highbitdepth)
+ highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
+ dst_ptr, rows, dst_width, dst_stride,
+ x_step_qn, x0_qn, pad_left, pad_right,
+ cm->seq_params.bit_depth);
else
upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
rows, dst_width, dst_stride, x_step_qn, x0_qn,
@@ -1175,7 +1176,7 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
const int num_planes = av1_num_planes(cm);
if (cm->width != unscaled->y_crop_width ||
cm->height != unscaled->y_crop_height) {
- av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
+ av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
num_planes);
return scaled;
} else {
@@ -1232,6 +1233,7 @@ static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
const int num_planes = av1_num_planes(cm);
if (!av1_superres_scaled(cm)) return;
+ const SequenceHeader *const seq_params = &cm->seq_params;
YV12_BUFFER_CONFIG copy_buffer;
memset(&copy_buffer, 0, sizeof(copy_buffer));
@@ -1239,10 +1241,10 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm);
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
- if (aom_alloc_frame_buffer(&copy_buffer, aligned_width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment))
+ if (aom_alloc_frame_buffer(
+ &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate copy buffer for superres upscaling");
@@ -1269,11 +1271,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
"Failed to free current frame buffer before superres upscaling");
// aom_realloc_frame_buffer() leaves config data for frame_to_show intact
- if (aom_realloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
- cm->superres_upscaled_height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, fb, cb, cb_priv))
+ if (aom_realloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate current frame buffer for superres upscaling");
@@ -1283,10 +1285,11 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
// Don't use callbacks on the encoder.
// aom_alloc_frame_buffer() clears the config data for frame_to_show
- if (aom_alloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
- cm->superres_upscaled_height, cm->subsampling_x,
- cm->subsampling_y, cm->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ if (aom_alloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to reallocate current frame buffer for superres upscaling");
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 58a5275ca..632967957 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -42,8 +42,8 @@ const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
AV1PixelRect rect;
- int ss_x = is_uv && cm->subsampling_x;
- int ss_y = is_uv && cm->subsampling_y;
+ int ss_x = is_uv && cm->seq_params.subsampling_x;
+ int ss_y = is_uv && cm->seq_params.subsampling_y;
rect.top = 0;
rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
@@ -1146,16 +1146,17 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
YV12_BUFFER_CONFIG *frame,
AV1_COMMON *cm, int optimized_lr,
int num_planes) {
- const int bit_depth = cm->bit_depth;
- const int highbd = cm->use_highbitdepth;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int bit_depth = seq_params->bit_depth;
+ const int highbd = seq_params->use_highbitdepth;
lr_ctxt->dst = &cm->rst_frame;
const int frame_width = frame->crop_widths[0];
const int frame_height = frame->crop_heights[0];
- if (aom_realloc_frame_buffer(lr_ctxt->dst, frame_width, frame_height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL) < 0)
+ if (aom_realloc_frame_buffer(
+ lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL) < 0)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate restoration dst buffer");
@@ -1180,8 +1181,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
highbd);
lr_plane_ctxt->rsi = rsi;
- lr_plane_ctxt->ss_x = is_uv && cm->subsampling_x;
- lr_plane_ctxt->ss_y = is_uv && cm->subsampling_y;
+ lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
+ lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
lr_plane_ctxt->highbd = highbd;
lr_plane_ctxt->bit_depth = bit_depth;
lr_plane_ctxt->data8 = frame->buffers[plane];
@@ -1337,7 +1338,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
int32_t *tmpbuf,
RestorationLineBuffers *rlbs) {
const int is_uv = plane > 0;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
const RestorationInfo *rsi = &cm->rst_info[plane];
@@ -1350,7 +1351,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int *rcol0, int *rcol1, int *rrow0,
- int *rrow1, int *tile_tl_idx) {
+ int *rrow1) {
assert(rcol0 && rcol1 && rrow0 && rrow1);
if (bsize != cm->seq_params.sb_size) return 0;
@@ -1383,8 +1384,8 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
// The size of an MI-unit on this plane of the image
- const int ss_x = is_uv && cm->subsampling_x;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
const int mi_size_x = MI_SIZE >> ss_x;
const int mi_size_y = MI_SIZE >> ss_y;
@@ -1419,9 +1420,6 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
*rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
*rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
- const int tile_idx = 0;
- *tile_tl_idx = tile_idx * rsi->units_per_tile;
-
return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
@@ -1468,7 +1466,7 @@ static void save_deblock_boundary_lines(
int upscaled_width;
int line_bytes;
if (av1_superres_scaled(cm)) {
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
line_bytes = upscaled_width << use_highbd;
if (use_highbd)
@@ -1515,7 +1513,7 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
// At the point where this function is called, we've already applied
// superres. So we don't need to extend the lines here; we can just
// pull directly from the topmost row of the upscaled frame.
- const int ss_x = is_uv && cm->subsampling_x;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
const int upscaled_width = av1_superres_scaled(cm)
? (cm->superres_upscaled_width + ss_x) >> ss_x
: src_width;
@@ -1535,7 +1533,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
int use_highbd, int plane,
AV1_COMMON *cm, int after_cdef) {
const int is_uv = plane > 0;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
@@ -1600,7 +1598,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
AV1_COMMON *cm, int after_cdef) {
const int num_planes = av1_num_planes(cm);
- const int use_highbd = cm->use_highbitdepth;
+ const int use_highbd = cm->seq_params.use_highbitdepth;
for (int p = 0; p < num_planes; ++p) {
save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
}
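
A pattern that recurs throughout this patch is the plane-aware subsampling shift, now spelled "is_uv && cm->seq_params.subsampling_x", with plane dimensions rounded rather than truncated when shifted. A tiny sketch of that arithmetic with an odd frame height and assumed 4:2:0 subsampling; the macro is reproduced under the assumption that it matches aom_dsp_common.h.

#include <stdio.h>

/* Assumed to match aom_dsp/aom_dsp_common.h. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

int main(void) {
  const int frame_height = 721; /* hypothetical odd frame height */
  const int subsampling_y = 1;  /* assumed 4:2:0 */
  for (int plane = 0; plane < 3; ++plane) {
    const int is_uv = plane > 0;
    const int ss_y = is_uv && subsampling_y; /* 0 for Y, 1 for U and V */
    const int rows = ROUND_POWER_OF_TWO(frame_height, ss_y); /* 721 or 361 */
    printf("plane %d: %d rows\n", plane, rows);
  }
  return 0;
}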
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index 0c4017534..aec37d834 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -346,7 +346,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int *rcol0, int *rcol1, int *rrow0,
- int *rrow1, int *tile_tl_idx);
+ int *rrow1);
void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
struct AV1Common *cm,
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index c5cebc135..d206586b5 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -39,13 +39,6 @@ extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
-static INLINE int get_coef_context(const int16_t *neighbors,
- const uint8_t *token_cache, int c) {
- return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
- token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >>
- 1;
-}
-
static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
TX_TYPE tx_type) {
return &av1_scan_orders[tx_size][tx_type];
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index 3fa998a91..f9b734b8c 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -572,7 +572,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
for (int plane = 0; plane < num_planes; plane++) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
const int is_uv = plane > 0;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
AV1PixelRect tile_rect = ctxt[plane].tile_rect;
const int unit_size = ctxt[plane].rsi->restoration_unit_size;
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 9a43ab29a..026c904b6 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -179,8 +179,8 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
r.bottom = AOMMIN(r.bottom, frame_h);
// Convert to coordinates in the appropriate plane
- const int ss_x = is_uv && cm->subsampling_x;
- const int ss_y = is_uv && cm->subsampling_y;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c
index 5ff538ae1..49dbde78f 100644
--- a/third_party/aom/av1/common/timing.c
+++ b/third_party/aom/av1/common/timing.c
@@ -53,8 +53,8 @@ int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
decoder_model->encoder_decoder_buffer_delay_length = 16;
- decoder_model->buffer_removal_delay_length = 10;
- decoder_model->frame_presentation_delay_length = 10;
+ decoder_model->buffer_removal_time_length = 10;
+ decoder_model->frame_presentation_time_length = 10;
}
void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
index d31f4b7fc..1749baa57 100644
--- a/third_party/aom/av1/common/timing.h
+++ b/third_party/aom/av1/common/timing.h
@@ -27,23 +27,23 @@ typedef struct aom_timing {
typedef struct aom_dec_model_info {
uint32_t num_units_in_decoding_tick;
int encoder_decoder_buffer_delay_length;
- int buffer_removal_delay_length;
- int frame_presentation_delay_length;
+ int buffer_removal_time_length;
+ int frame_presentation_time_length;
} aom_dec_model_info_t;
typedef struct aom_dec_model_op_parameters {
int decoder_model_param_present_flag;
int64_t bitrate;
int64_t buffer_size;
- int decoder_buffer_delay;
- int encoder_buffer_delay;
+ uint32_t decoder_buffer_delay;
+ uint32_t encoder_buffer_delay;
int low_delay_mode_flag;
int display_model_param_present_flag;
int initial_display_delay;
} aom_dec_model_op_parameters_t;
typedef struct aom_op_timing_info_t {
- int64_t buffer_removal_delay;
+ uint32_t buffer_removal_time;
} aom_op_timing_info_t;
void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index cdac90d9e..f0ab79d0f 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -466,31 +466,6 @@ static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels,
return mag;
}
-static INLINE int get_nz_count(const uint8_t *const levels, const int bwl,
- const TX_CLASS tx_class) {
- int count;
-
- count = (levels[1] != 0); // { 0, 1 }
- count += (levels[(1 << bwl) + TX_PAD_HOR] != 0); // { 1, 0 }
-
- for (int idx = 0; idx < SIG_REF_DIFF_OFFSET_NUM; ++idx) {
- const int row_offset =
- ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][0]
- : ((tx_class == TX_CLASS_VERT)
- ? sig_ref_diff_offset_vert[idx][0]
- : sig_ref_diff_offset_horiz[idx][0]));
- const int col_offset =
- ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][1]
- : ((tx_class == TX_CLASS_VERT)
- ? sig_ref_diff_offset_vert[idx][1]
- : sig_ref_diff_offset_horiz[idx][1]));
- const int nb_pos =
- (row_offset << bwl) + (row_offset << TX_PAD_HOR_LOG2) + col_offset;
- count += (levels[nb_pos] != 0);
- }
- return count;
-}
-
#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D
#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5)
#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10)
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index ae6f07657..412d83ed8 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -92,33 +92,6 @@ static const int error_measure_lut[512] = {
};
/* clang-format on */
-void project_points_affine(const int32_t *mat, int *points, int *proj,
- const int n, const int stride_points,
- const int stride_proj, const int subsampling_x,
- const int subsampling_y) {
- for (int i = 0; i < n; ++i) {
- const int x = *(points++), y = *(points++);
- if (subsampling_x)
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- WARPEDDIFF_PREC_BITS + 1);
- else
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
- WARPEDDIFF_PREC_BITS);
- if (subsampling_y)
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * x + mat[5] * 2 * y + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- WARPEDDIFF_PREC_BITS + 1);
- else
- *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1],
- WARPEDDIFF_PREC_BITS);
- points += stride_points - 2;
- proj += stride_proj - 2;
- }
-}
-
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index f5da36bbb..ce4032ee5 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -68,11 +68,6 @@ static const uint8_t warp_pad_right[14][16] = {
{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
};
-void project_points_affine(const int32_t *mat, int *points, int *proj,
- const int n, const int stride_points,
- const int stride_proj, const int subsampling_x,
- const int subsampling_y);
-
// Returns the error between the result of applying motion 'wm' to the frame
// described by 'ref' and the frame described by 'dst'.
int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 6747cae01..0c5286f9d 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -39,7 +39,7 @@ static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
// Load the filter coefficients
const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
@@ -140,7 +140,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
int x;
@@ -232,8 +232,8 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
}
void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
@@ -278,7 +278,7 @@ static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
// Load the filter coefficients
const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
@@ -372,7 +372,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
int x;
@@ -472,8 +472,8 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
void av1_highbd_convolve_2d_scale_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params, int bd) {
// TODO(yaowu): Move this out of stack
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
index 7415c58df..ae331b40d 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -19,49 +19,47 @@
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x1[0], x1[3]);
- btf_16_adds_subs_avx2(x1[1], x1[2]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]);
-
- btf_16_adds_subs_avx2(x1[8], x1[11]);
- btf_16_adds_subs_avx2(x1[9], x1[10]);
- btf_16_subs_adds_avx2(x1[15], x1[12]);
- btf_16_subs_adds_avx2(x1[14], x1[13]);
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
}
static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
}
static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
- btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]);
- btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]);
- btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]);
- btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]);
- btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]);
- btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]);
- btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]);
- btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]);
+ btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
}
static void idct16_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
__m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
__m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
@@ -103,29 +101,29 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output,
x1[15] = input[15];
// stage 2
- btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
- btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
- btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
// stage 3
- btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
- btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
- btf_16_adds_subs_avx2(x1[8], x1[9]);
- btf_16_subs_adds_avx2(x1[11], x1[10]);
- btf_16_adds_subs_avx2(x1[12], x1[13]);
- btf_16_subs_adds_avx2(x1[15], x1[14]);
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
// stage 4
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
- btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
- btf_16_adds_subs_avx2(x1[4], x1[5]);
- btf_16_subs_adds_avx2(x1[7], x1[6]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
-
- idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
- idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage6_avx2(x1, cospi, _r, cos_bit);
idct16_stage7_avx2(output, x1);
}
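
In the av1_inv_txfm_avx2.c hunks, the __rounding constant is renamed _r and is now passed explicitly to the btf_16_w16_avx2 helpers along with cos_bit, and the in/out arguments become pointers updated in place. For orientation, here is a scalar sketch of the butterfly those helpers vectorize sixteen lanes at a time. It is only an illustration, under the assumptions that cos_bit equals INV_COS_BIT (12) and that _r is the matching rounding offset 1 << (cos_bit - 1); the helper names below are invented for the sketch.

#include <stdint.h>
#include <stdio.h>

/* Illustrative scalar rounding: add the offset that _r holds, then shift. */
static int32_t round_shift(int64_t value, int bit) {
  return (int32_t)((value + ((int64_t)1 << (bit - 1))) >> bit);
}

/* One butterfly on a pair of coefficients: the scalar analogue of applying
 * two cospi weight pairs (w0a, w0b) and (w1a, w1b) to in0/in1 in place. */
static void butterfly(int32_t w0a, int32_t w0b, int32_t w1a, int32_t w1b,
                      int32_t *in0, int32_t *in1, int bit) {
  const int64_t t0 = (int64_t)w0a * *in0 + (int64_t)w0b * *in1;
  const int64_t t1 = (int64_t)w1a * *in0 + (int64_t)w1b * *in1;
  *in0 = round_shift(t0, bit);
  *in1 = round_shift(t1, bit);
}

int main(void) {
  const int bit = 12;       /* assumed INV_COS_BIT */
  const int32_t c32 = 2896; /* roughly cos(pi/4) scaled by 1 << 12 */
  int32_t a = 100, b = 40;
  /* Scalar counterpart of the (cospi_m32_p32, cospi_p32_p32) step, i.e.
   * btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &a, &b, _r, cos_bit). */
  butterfly(-c32, c32, c32, c32, &a, &b, bit);
  printf("%d %d\n", a, b);
  return 0;
}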
@@ -133,7 +131,7 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
@@ -159,21 +157,21 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
// stage 3
btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
- btf_16_adds_subs_avx2(x1[8], x1[9]);
- btf_16_subs_adds_avx2(x1[11], x1[10]);
- btf_16_adds_subs_avx2(x1[12], x1[13]);
- btf_16_subs_adds_avx2(x1[15], x1[14]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
// stage 4
btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
- btf_16_adds_subs_avx2(x1[4], x1[5]);
- btf_16_subs_adds_avx2(x1[7], x1[6]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
- idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
- idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage6_avx2(x1, cospi, _r, cos_bit);
idct16_stage7_avx2(output, x1);
}
@@ -212,74 +210,71 @@ static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
}
static INLINE void iadst16_stage3_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[0], x[8]);
- btf_16_adds_subs_avx2(x[1], x[9]);
- btf_16_adds_subs_avx2(x[2], x[10]);
- btf_16_adds_subs_avx2(x[3], x[11]);
- btf_16_adds_subs_avx2(x[4], x[12]);
- btf_16_adds_subs_avx2(x[5], x[13]);
- btf_16_adds_subs_avx2(x[6], x[14]);
- btf_16_adds_subs_avx2(x[7], x[15]);
+ btf_16_adds_subs_avx2(&x[0], &x[8]);
+ btf_16_adds_subs_avx2(&x[1], &x[9]);
+ btf_16_adds_subs_avx2(&x[2], &x[10]);
+ btf_16_adds_subs_avx2(&x[3], &x[11]);
+ btf_16_adds_subs_avx2(&x[4], &x[12]);
+ btf_16_adds_subs_avx2(&x[5], &x[13]);
+ btf_16_adds_subs_avx2(&x[6], &x[14]);
+ btf_16_adds_subs_avx2(&x[7], &x[15]);
}
static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
- btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
- btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
- btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
- btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
}
static INLINE void iadst16_stage5_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[0], x[4]);
- btf_16_adds_subs_avx2(x[1], x[5]);
- btf_16_adds_subs_avx2(x[2], x[6]);
- btf_16_adds_subs_avx2(x[3], x[7]);
- btf_16_adds_subs_avx2(x[8], x[12]);
- btf_16_adds_subs_avx2(x[9], x[13]);
- btf_16_adds_subs_avx2(x[10], x[14]);
- btf_16_adds_subs_avx2(x[11], x[15]);
+ btf_16_adds_subs_avx2(&x[0], &x[4]);
+ btf_16_adds_subs_avx2(&x[1], &x[5]);
+ btf_16_adds_subs_avx2(&x[2], &x[6]);
+ btf_16_adds_subs_avx2(&x[3], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[12]);
+ btf_16_adds_subs_avx2(&x[9], &x[13]);
+ btf_16_adds_subs_avx2(&x[10], &x[14]);
+ btf_16_adds_subs_avx2(&x[11], &x[15]);
}
static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
- btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
- btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
}
static INLINE void iadst16_stage7_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[0], x[2]);
- btf_16_adds_subs_avx2(x[1], x[3]);
- btf_16_adds_subs_avx2(x[4], x[6]);
- btf_16_adds_subs_avx2(x[5], x[7]);
- btf_16_adds_subs_avx2(x[8], x[10]);
- btf_16_adds_subs_avx2(x[9], x[11]);
- btf_16_adds_subs_avx2(x[12], x[14]);
- btf_16_adds_subs_avx2(x[13], x[15]);
+ btf_16_adds_subs_avx2(&x[0], &x[2]);
+ btf_16_adds_subs_avx2(&x[1], &x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[6]);
+ btf_16_adds_subs_avx2(&x[5], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[10]);
+ btf_16_adds_subs_avx2(&x[9], &x[11]);
+ btf_16_adds_subs_avx2(&x[12], &x[14]);
+ btf_16_adds_subs_avx2(&x[13], &x[15]);
}
static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]);
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
}
static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
@@ -307,7 +302,7 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output,
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
__m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
__m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
@@ -346,21 +341,21 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output,
x1[15] = input[14];
// stage 2
- btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]);
- btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]);
- btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]);
- btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]);
- btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]);
- btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]);
- btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]);
- btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]);
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
iadst16_stage3_avx2(x1);
- iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
iadst16_stage5_avx2(x1);
- iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
iadst16_stage7_avx2(x1);
- iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
iadst16_stage9_avx2(output, x1);
}
@@ -368,7 +363,7 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
// stage 1
__m256i x1[16];
@@ -392,11 +387,11 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
iadst16_stage3_avx2(x1);
- iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
iadst16_stage5_avx2(x1);
- iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
iadst16_stage7_avx2(x1);
- iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
iadst16_stage9_avx2(output, x1);
}
@@ -404,7 +399,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
@@ -423,7 +418,7 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
x1[9] = x1[1];
// stage 4
- btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]);
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
// stage 5
x1[4] = x1[0];
@@ -433,8 +428,8 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
x1[13] = x1[9];
// stage 6
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]);
- btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
// stage 7
x1[2] = x1[0];
@@ -446,130 +441,125 @@ static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
x1[14] = x1[12];
x1[15] = x1[13];
- iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
iadst16_stage9_avx2(output, x1);
}
static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
- btf_16_adds_subs_avx2(x[16], x[17]);
- btf_16_subs_adds_avx2(x[19], x[18]);
- btf_16_adds_subs_avx2(x[20], x[21]);
- btf_16_subs_adds_avx2(x[23], x[22]);
- btf_16_adds_subs_avx2(x[24], x[25]);
- btf_16_subs_adds_avx2(x[27], x[26]);
- btf_16_adds_subs_avx2(x[28], x[29]);
- btf_16_subs_adds_avx2(x[31], x[30]);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
}
static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
}
static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
- btf_16_adds_subs_avx2(x[16], x[19]);
- btf_16_adds_subs_avx2(x[17], x[18]);
- btf_16_subs_adds_avx2(x[23], x[20]);
- btf_16_subs_adds_avx2(x[22], x[21]);
- btf_16_adds_subs_avx2(x[24], x[27]);
- btf_16_adds_subs_avx2(x[25], x[26]);
- btf_16_subs_adds_avx2(x[31], x[28]);
- btf_16_subs_adds_avx2(x[30], x[29]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
}
static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[11]);
- btf_16_adds_subs_avx2(x[9], x[10]);
- btf_16_subs_adds_avx2(x[15], x[12]);
- btf_16_subs_adds_avx2(x[14], x[13]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
}
static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- btf_16_adds_subs_avx2(x[16], x[23]);
- btf_16_adds_subs_avx2(x[17], x[22]);
- btf_16_adds_subs_avx2(x[18], x[21]);
- btf_16_adds_subs_avx2(x[19], x[20]);
- btf_16_subs_adds_avx2(x[31], x[24]);
- btf_16_subs_adds_avx2(x[30], x[25]);
- btf_16_subs_adds_avx2(x[29], x[26]);
- btf_16_subs_adds_avx2(x[28], x[27]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
}
static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[15]);
- btf_16_adds_subs_avx2(x[1], x[14]);
- btf_16_adds_subs_avx2(x[2], x[13]);
- btf_16_adds_subs_avx2(x[3], x[12]);
- btf_16_adds_subs_avx2(x[4], x[11]);
- btf_16_adds_subs_avx2(x[5], x[10]);
- btf_16_adds_subs_avx2(x[6], x[9]);
- btf_16_adds_subs_avx2(x[7], x[8]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
}
static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
- btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]);
- btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]);
- btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]);
- btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]);
- btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]);
- btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]);
- btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]);
- btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]);
- btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]);
- btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]);
- btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]);
- btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]);
- btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]);
- btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]);
- btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]);
- btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]);
+ btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
}
static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
@@ -629,7 +619,7 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
// stage 1
__m256i x[32];
@@ -666,20 +656,20 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
x[10] = x[11];
x[13] = x[12];
x[14] = x[15];
- idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+ idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
x[5] = x[4];
x[6] = x[7];
- idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+ idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
// stage 6
x[3] = x[0];
x[2] = x[1];
- idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+ idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
- idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
- idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage7_avx2(x, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x, cospi, _r, cos_bit);
idct32_stage9_avx2(output, x);
}
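Note on the renamed constant (illustrative, not part of the patch): _r holds the same value the old __rounding did, 1 << (INV_COS_BIT - 1). Assuming INV_COS_BIT is 12 as defined in av1_txfm.h, each butterfly lane is rounded to nearest before the shift back to 16-bit precision, i.e.:

/* Sketch of one rounded butterfly lane; the AVX2 code does the same thing
 * with _mm256_add_epi32(sum, _r) followed by _mm256_srai_epi32(sum, cos_bit). */
static int32_t round_shift_lane(int32_t dot) {  /* dot = w0 * x + w1 * y        */
  const int32_t r = 1 << (12 - 1);              /* 2048, the value broadcast in _r */
  return (int32_t)(((int64_t)dot + r) >> 12);   /* round to nearest, then narrow */
}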
@@ -687,7 +677,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
// stage 1
__m256i x[32];
@@ -728,25 +718,25 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
// stage 4
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[9]);
- btf_16_subs_adds_avx2(x[11], x[10]);
- btf_16_adds_subs_avx2(x[12], x[13]);
- btf_16_subs_adds_avx2(x[15], x[14]);
- idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
- btf_16_adds_subs_avx2(x[4], x[5]);
- btf_16_subs_adds_avx2(x[7], x[6]);
- idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
// stage 6
-   btf_16_adds_subs_avx2(x[0], x[3]);
- btf_16_adds_subs_avx2(x[1], x[2]);
- idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
- idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
- idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage7_avx2(x, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x, cospi, _r, cos_bit);
idct32_stage9_avx2(output, x);
}
@@ -754,7 +744,7 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
__m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
__m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
@@ -825,51 +815,50 @@ static void idct32_new_avx2(const __m256i *input, __m256i *output,
x1[31] = input[31];
// stage 2
- btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]);
- btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]);
- btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]);
- btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]);
- btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]);
- btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]);
- btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]);
- btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]);
+ btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);
// stage 3
- btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
- btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
- btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
- btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
idct32_high16_stage3_avx2(x1);
// stage 4
- btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
- btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
- btf_16_adds_subs_avx2(x1[8], x1[9]);
- btf_16_subs_adds_avx2(x1[11], x1[10]);
- btf_16_adds_subs_avx2(x1[12], x1[13]);
- btf_16_subs_adds_avx2(x1[15], x1[14]);
- idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);
// stage 5
- btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
- btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
- btf_16_adds_subs_avx2(x1[4], x1[5]);
- btf_16_subs_adds_avx2(x1[7], x1[6]);
- idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);
// stage 6
- btf_16_adds_subs_avx2(x1[0], x1[3]);
- btf_16_adds_subs_avx2(x1[1], x1[2]);
- idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);
- idct32_stage7_avx2(x1, cospi, __rounding, cos_bit);
- idct32_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ idct32_stage7_avx2(x1, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x1, cospi, _r, cos_bit);
idct32_stage9_avx2(output, x1);
}
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
@@ -883,19 +872,18 @@ static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
- btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
- btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
- btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
- btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
- btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
- btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}
static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
@@ -903,31 +891,30 @@ static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
- btf_16_adds_subs_avx2(x[32], x[35]);
- btf_16_adds_subs_avx2(x[33], x[34]);
- btf_16_subs_adds_avx2(x[39], x[36]);
- btf_16_subs_adds_avx2(x[38], x[37]);
- btf_16_adds_subs_avx2(x[40], x[43]);
- btf_16_adds_subs_avx2(x[41], x[42]);
- btf_16_subs_adds_avx2(x[47], x[44]);
- btf_16_subs_adds_avx2(x[46], x[45]);
- btf_16_adds_subs_avx2(x[48], x[51]);
- btf_16_adds_subs_avx2(x[49], x[50]);
- btf_16_subs_adds_avx2(x[55], x[52]);
- btf_16_subs_adds_avx2(x[54], x[53]);
- btf_16_adds_subs_avx2(x[56], x[59]);
- btf_16_adds_subs_avx2(x[57], x[58]);
- btf_16_subs_adds_avx2(x[63], x[60]);
- btf_16_subs_adds_avx2(x[62], x[61]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[35]);
+ btf_16_adds_subs_avx2(&x[33], &x[34]);
+ btf_16_adds_subs_avx2(&x[39], &x[36]);
+ btf_16_adds_subs_avx2(&x[38], &x[37]);
+ btf_16_adds_subs_avx2(&x[40], &x[43]);
+ btf_16_adds_subs_avx2(&x[41], &x[42]);
+ btf_16_adds_subs_avx2(&x[47], &x[44]);
+ btf_16_adds_subs_avx2(&x[46], &x[45]);
+ btf_16_adds_subs_avx2(&x[48], &x[51]);
+ btf_16_adds_subs_avx2(&x[49], &x[50]);
+ btf_16_adds_subs_avx2(&x[55], &x[52]);
+ btf_16_adds_subs_avx2(&x[54], &x[53]);
+ btf_16_adds_subs_avx2(&x[56], &x[59]);
+ btf_16_adds_subs_avx2(&x[57], &x[58]);
+ btf_16_adds_subs_avx2(&x[63], &x[60]);
+ btf_16_adds_subs_avx2(&x[62], &x[61]);
}
static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
@@ -935,185 +922,180 @@ static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
- btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}
static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
- btf_16_adds_subs_avx2(x[16], x[19]);
- btf_16_adds_subs_avx2(x[17], x[18]);
- btf_16_subs_adds_avx2(x[23], x[20]);
- btf_16_subs_adds_avx2(x[22], x[21]);
- btf_16_adds_subs_avx2(x[24], x[27]);
- btf_16_adds_subs_avx2(x[25], x[26]);
- btf_16_subs_adds_avx2(x[31], x[28]);
- btf_16_subs_adds_avx2(x[30], x[29]);
- idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+ const __m256i _r, int8_t cos_bit) {
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
+ idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
}
static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
- btf_16_adds_subs_avx2(x[32], x[39]);
- btf_16_adds_subs_avx2(x[33], x[38]);
- btf_16_adds_subs_avx2(x[34], x[37]);
- btf_16_adds_subs_avx2(x[35], x[36]);
- btf_16_subs_adds_avx2(x[47], x[40]);
- btf_16_subs_adds_avx2(x[46], x[41]);
- btf_16_subs_adds_avx2(x[45], x[42]);
- btf_16_subs_adds_avx2(x[44], x[43]);
- btf_16_adds_subs_avx2(x[48], x[55]);
- btf_16_adds_subs_avx2(x[49], x[54]);
- btf_16_adds_subs_avx2(x[50], x[53]);
- btf_16_adds_subs_avx2(x[51], x[52]);
- btf_16_subs_adds_avx2(x[63], x[56]);
- btf_16_subs_adds_avx2(x[62], x[57]);
- btf_16_subs_adds_avx2(x[61], x[58]);
- btf_16_subs_adds_avx2(x[60], x[59]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[39]);
+ btf_16_adds_subs_avx2(&x[33], &x[38]);
+ btf_16_adds_subs_avx2(&x[34], &x[37]);
+ btf_16_adds_subs_avx2(&x[35], &x[36]);
+ btf_16_adds_subs_avx2(&x[47], &x[40]);
+ btf_16_adds_subs_avx2(&x[46], &x[41]);
+ btf_16_adds_subs_avx2(&x[45], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[43]);
+ btf_16_adds_subs_avx2(&x[48], &x[55]);
+ btf_16_adds_subs_avx2(&x[49], &x[54]);
+ btf_16_adds_subs_avx2(&x[50], &x[53]);
+ btf_16_adds_subs_avx2(&x[51], &x[52]);
+ btf_16_adds_subs_avx2(&x[63], &x[56]);
+ btf_16_adds_subs_avx2(&x[62], &x[57]);
+ btf_16_adds_subs_avx2(&x[61], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[59]);
}
static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
- btf_16_adds_subs_avx2(x[16], x[23]);
- btf_16_adds_subs_avx2(x[17], x[22]);
- btf_16_adds_subs_avx2(x[18], x[21]);
- btf_16_adds_subs_avx2(x[19], x[20]);
- btf_16_subs_adds_avx2(x[31], x[24]);
- btf_16_subs_adds_avx2(x[30], x[25]);
- btf_16_subs_adds_avx2(x[29], x[26]);
- btf_16_subs_adds_avx2(x[28], x[27]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
}
static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[15]);
- btf_16_adds_subs_avx2(x[1], x[14]);
- btf_16_adds_subs_avx2(x[2], x[13]);
- btf_16_adds_subs_avx2(x[3], x[12]);
- btf_16_adds_subs_avx2(x[4], x[11]);
- btf_16_adds_subs_avx2(x[5], x[10]);
- btf_16_adds_subs_avx2(x[6], x[9]);
- btf_16_adds_subs_avx2(x[7], x[8]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
- btf_16_adds_subs_avx2(x[32], x[47]);
- btf_16_adds_subs_avx2(x[33], x[46]);
- btf_16_adds_subs_avx2(x[34], x[45]);
- btf_16_adds_subs_avx2(x[35], x[44]);
- btf_16_adds_subs_avx2(x[36], x[43]);
- btf_16_adds_subs_avx2(x[37], x[42]);
- btf_16_adds_subs_avx2(x[38], x[41]);
- btf_16_adds_subs_avx2(x[39], x[40]);
- btf_16_subs_adds_avx2(x[63], x[48]);
- btf_16_subs_adds_avx2(x[62], x[49]);
- btf_16_subs_adds_avx2(x[61], x[50]);
- btf_16_subs_adds_avx2(x[60], x[51]);
- btf_16_subs_adds_avx2(x[59], x[52]);
- btf_16_subs_adds_avx2(x[58], x[53]);
- btf_16_subs_adds_avx2(x[57], x[54]);
- btf_16_subs_adds_avx2(x[56], x[55]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[47]);
+ btf_16_adds_subs_avx2(&x[33], &x[46]);
+ btf_16_adds_subs_avx2(&x[34], &x[45]);
+ btf_16_adds_subs_avx2(&x[35], &x[44]);
+ btf_16_adds_subs_avx2(&x[36], &x[43]);
+ btf_16_adds_subs_avx2(&x[37], &x[42]);
+ btf_16_adds_subs_avx2(&x[38], &x[41]);
+ btf_16_adds_subs_avx2(&x[39], &x[40]);
+ btf_16_adds_subs_avx2(&x[63], &x[48]);
+ btf_16_adds_subs_avx2(&x[62], &x[49]);
+ btf_16_adds_subs_avx2(&x[61], &x[50]);
+ btf_16_adds_subs_avx2(&x[60], &x[51]);
+ btf_16_adds_subs_avx2(&x[59], &x[52]);
+ btf_16_adds_subs_avx2(&x[58], &x[53]);
+ btf_16_adds_subs_avx2(&x[57], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[55]);
}
static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
- const __m256i __rounding,
- int8_t cos_bit) {
+ const __m256i _r, int8_t cos_bit) {
(void)cos_bit;
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
- btf_16_adds_subs_avx2(x[0], x[31]);
- btf_16_adds_subs_avx2(x[1], x[30]);
- btf_16_adds_subs_avx2(x[2], x[29]);
- btf_16_adds_subs_avx2(x[3], x[28]);
- btf_16_adds_subs_avx2(x[4], x[27]);
- btf_16_adds_subs_avx2(x[5], x[26]);
- btf_16_adds_subs_avx2(x[6], x[25]);
- btf_16_adds_subs_avx2(x[7], x[24]);
- btf_16_adds_subs_avx2(x[8], x[23]);
- btf_16_adds_subs_avx2(x[9], x[22]);
- btf_16_adds_subs_avx2(x[10], x[21]);
- btf_16_adds_subs_avx2(x[11], x[20]);
- btf_16_adds_subs_avx2(x[12], x[19]);
- btf_16_adds_subs_avx2(x[13], x[18]);
- btf_16_adds_subs_avx2(x[14], x[17]);
- btf_16_adds_subs_avx2(x[15], x[16]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+ btf_16_adds_subs_avx2(&x[0], &x[31]);
+ btf_16_adds_subs_avx2(&x[1], &x[30]);
+ btf_16_adds_subs_avx2(&x[2], &x[29]);
+ btf_16_adds_subs_avx2(&x[3], &x[28]);
+ btf_16_adds_subs_avx2(&x[4], &x[27]);
+ btf_16_adds_subs_avx2(&x[5], &x[26]);
+ btf_16_adds_subs_avx2(&x[6], &x[25]);
+ btf_16_adds_subs_avx2(&x[7], &x[24]);
+ btf_16_adds_subs_avx2(&x[8], &x[23]);
+ btf_16_adds_subs_avx2(&x[9], &x[22]);
+ btf_16_adds_subs_avx2(&x[10], &x[21]);
+ btf_16_adds_subs_avx2(&x[11], &x[20]);
+ btf_16_adds_subs_avx2(&x[12], &x[19]);
+ btf_16_adds_subs_avx2(&x[13], &x[18]);
+ btf_16_adds_subs_avx2(&x[14], &x[17]);
+ btf_16_adds_subs_avx2(&x[15], &x[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
}
static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
- btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]);
- btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]);
- btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]);
- btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]);
- btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]);
- btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]);
- btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]);
- btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]);
- btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]);
- btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]);
- btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]);
- btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]);
- btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]);
- btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]);
- btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]);
- btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]);
- btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]);
- btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]);
- btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]);
- btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]);
- btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]);
- btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]);
- btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]);
- btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]);
- btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]);
- btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]);
- btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]);
- btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]);
- btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]);
- btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]);
- btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]);
- btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]);
+ btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
+ btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
+ btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
+ btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
+ btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
+ btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
+ btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
+ btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
+ btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
+ btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
+ btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
+ btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
+ btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
+ btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
+ btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
+ btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
+ btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}
static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
@@ -1207,7 +1189,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
@@ -1260,16 +1242,16 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[22] = x[23];
x[25] = x[24];
x[30] = x[31];
- btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
- btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
- btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
- btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
// stage 5
x[9] = x[8];
x[14] = x[15];
- btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
- btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
x[35] = x[32];
x[34] = x[33];
x[36] = x[39];
@@ -1289,7 +1271,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
// stage 6
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
x[19] = x[16];
x[18] = x[17];
x[20] = x[23];
@@ -1298,7 +1280,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[26] = x[25];
x[28] = x[31];
x[29] = x[30];
- idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
// stage 7
x[3] = x[0];
@@ -1307,7 +1289,7 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[10] = x[9];
x[12] = x[15];
x[13] = x[14];
- idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
// stage 8
x[7] = x[0];
@@ -1315,12 +1297,12 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[5] = x[2];
x[4] = x[3];
x[9] = x[9];
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
- idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
- idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
idct64_stage11_avx2(output, x);
}
@@ -1328,7 +1310,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
@@ -1398,7 +1380,7 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
x[26] = x[27];
x[29] = x[28];
x[30] = x[31];
- idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
@@ -1406,37 +1388,37 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
x[10] = x[11];
x[13] = x[12];
x[14] = x[15];
- idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
// stage 6
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
x[5] = x[4];
x[6] = x[7];
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
- idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
// stage 7
x[3] = x[0];
x[2] = x[1];
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[11]);
- btf_16_adds_subs_avx2(x[9], x[10]);
- btf_16_subs_adds_avx2(x[15], x[12]);
- btf_16_subs_adds_avx2(x[14], x[13]);
- idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
// stage 8
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
-
- idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
- idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
idct64_stage11_avx2(output, x);
}
@@ -1444,7 +1426,7 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
- const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
@@ -1514,78 +1496,78 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
- btf_16_adds_subs_avx2(x[32], x[33]);
- btf_16_subs_adds_avx2(x[35], x[34]);
- btf_16_adds_subs_avx2(x[36], x[37]);
- btf_16_subs_adds_avx2(x[39], x[38]);
- btf_16_adds_subs_avx2(x[40], x[41]);
- btf_16_subs_adds_avx2(x[43], x[42]);
- btf_16_adds_subs_avx2(x[44], x[45]);
- btf_16_subs_adds_avx2(x[47], x[46]);
- btf_16_adds_subs_avx2(x[48], x[49]);
- btf_16_subs_adds_avx2(x[51], x[50]);
- btf_16_adds_subs_avx2(x[52], x[53]);
- btf_16_subs_adds_avx2(x[55], x[54]);
- btf_16_adds_subs_avx2(x[56], x[57]);
- btf_16_subs_adds_avx2(x[59], x[58]);
- btf_16_adds_subs_avx2(x[60], x[61]);
- btf_16_subs_adds_avx2(x[63], x[62]);
+ btf_16_adds_subs_avx2(&x[32], &x[33]);
+ btf_16_adds_subs_avx2(&x[35], &x[34]);
+ btf_16_adds_subs_avx2(&x[36], &x[37]);
+ btf_16_adds_subs_avx2(&x[39], &x[38]);
+ btf_16_adds_subs_avx2(&x[40], &x[41]);
+ btf_16_adds_subs_avx2(&x[43], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[45]);
+ btf_16_adds_subs_avx2(&x[47], &x[46]);
+ btf_16_adds_subs_avx2(&x[48], &x[49]);
+ btf_16_adds_subs_avx2(&x[51], &x[50]);
+ btf_16_adds_subs_avx2(&x[52], &x[53]);
+ btf_16_adds_subs_avx2(&x[55], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[57]);
+ btf_16_adds_subs_avx2(&x[59], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[61]);
+ btf_16_adds_subs_avx2(&x[63], &x[62]);
// stage 4
btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
- btf_16_adds_subs_avx2(x[16], x[17]);
- btf_16_subs_adds_avx2(x[19], x[18]);
- btf_16_adds_subs_avx2(x[20], x[21]);
- btf_16_subs_adds_avx2(x[23], x[22]);
- btf_16_adds_subs_avx2(x[24], x[25]);
- btf_16_subs_adds_avx2(x[27], x[26]);
- btf_16_adds_subs_avx2(x[28], x[29]);
- btf_16_subs_adds_avx2(x[31], x[30]);
- idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
// stage 5
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[9]);
- btf_16_subs_adds_avx2(x[11], x[10]);
- btf_16_adds_subs_avx2(x[12], x[13]);
- btf_16_subs_adds_avx2(x[15], x[14]);
- idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
// stage 6
btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
- btf_16_adds_subs_avx2(x[4], x[5]);
- btf_16_subs_adds_avx2(x[7], x[6]);
- btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
- btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
- idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
// stage 7
- btf_16_adds_subs_avx2(x[0], x[3]);
- btf_16_adds_subs_avx2(x[1], x[2]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
- btf_16_adds_subs_avx2(x[8], x[11]);
- btf_16_adds_subs_avx2(x[9], x[10]);
- btf_16_subs_adds_avx2(x[15], x[12]);
- btf_16_subs_adds_avx2(x[14], x[13]);
- idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
// stage 8
- btf_16_adds_subs_avx2(x[0], x[7]);
- btf_16_adds_subs_avx2(x[1], x[6]);
- btf_16_adds_subs_avx2(x[2], x[5]);
- btf_16_adds_subs_avx2(x[3], x[4]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
- btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
- idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
// stage 9~11
- idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
- idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
idct64_stage11_avx2(output, x);
}
@@ -1667,7 +1649,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
if (lr_flip) {
for (int j = 0; j < buf_size_w_div16; ++j) {
__m256i temp[16];
- flip_buf_av2(buf0 + 16 * j, temp, 16);
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
}
@@ -1693,18 +1675,18 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
int txw_idx, int rect_type) {
const int32_t *input_row = input;
const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
- const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
- (1 << (NewSqrt2Bits - shift - 1)));
+ const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
const __m256i one = _mm256_set1_epi16(1);
- const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding);
+ const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
if (rect_type != 1 && rect_type != -1) {
for (int i = 0; i < height; ++i) {
const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
input_row += stride;
__m256i lo = _mm256_unpacklo_epi16(src, one);
__m256i hi = _mm256_unpackhi_epi16(src, one);
- lo = _mm256_madd_epi16(lo, scale_rounding);
- hi = _mm256_madd_epi16(hi, scale_rounding);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
out[i] = _mm256_packs_epi32(lo, hi);
@@ -1718,8 +1700,8 @@ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
input_row += stride;
__m256i lo = _mm256_unpacklo_epi16(src, one);
__m256i hi = _mm256_unpackhi_epi16(src, one);
- lo = _mm256_madd_epi16(lo, scale_rounding);
- hi = _mm256_madd_epi16(hi, scale_rounding);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
out[i] = _mm256_packs_epi32(lo, hi);
@@ -1731,10 +1713,10 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
__m256i *buf, int shift, int height,
int txh_idx) {
const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
- const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
- const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1));
+ const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
const __m256i one = _mm256_set1_epi16(1);
- const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding);
+ const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
for (int h = 0; h < height; ++h) {
__m256i lo = _mm256_unpacklo_epi16(buf[h], one);
__m256i hi = _mm256_unpackhi_epi16(buf[h], one);
@@ -1742,8 +1724,8 @@ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
hi = _mm256_madd_epi16(hi, scale_coeff);
lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
- lo = _mm256_add_epi32(lo, shift_rounding);
- hi = _mm256_add_epi32(hi, shift_rounding);
+ lo = _mm256_add_epi32(lo, shift__r);
+ hi = _mm256_add_epi32(hi, shift__r);
lo = _mm256_srai_epi32(lo, -shift);
hi = _mm256_srai_epi32(hi, -shift);
const __m256i x = _mm256_packs_epi32(lo, hi);
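The two identity-transform hunks above only rename the rounding vectors, but the scaling idiom they rely on is worth spelling out: interleaving the 16-bit samples with a vector of ones lets a single _mm256_madd_epi16 apply the NewSqrt2 scale and add the rounding offset in one step. A self-contained sketch of that idiom, following the row path; the helper name and standalone form are illustrative, not code from the patch:

#include <immintrin.h>

/* Hypothetical helper mirroring iidentity_row_16xn_avx2: scale 16 samples by a
 * fixed-point factor with rounding, entirely in 16/32-bit lanes. */
static inline __m256i scale_round_w16(__m256i src, int16_t scale,
                                      int16_t round, int shift) {
  const __m256i one = _mm256_set1_epi16(1);
  /* (scale, round) pairs per 32-bit lane: madd then yields src*scale + 1*round. */
  const __m256i scale_r = _mm256_unpacklo_epi16(_mm256_set1_epi16(scale),
                                                _mm256_set1_epi16(round));
  __m256i lo = _mm256_unpacklo_epi16(src, one);  /* src0, 1, src1, 1, ...        */
  __m256i hi = _mm256_unpackhi_epi16(src, one);
  lo = _mm256_srai_epi32(_mm256_madd_epi16(lo, scale_r), shift);
  hi = _mm256_srai_epi32(_mm256_madd_epi16(hi, scale_r), shift);
  return _mm256_packs_epi32(lo, hi);             /* back to 16 signed 16-bit lanes */
}

iidentity_col_16xn_avx2 uses the same interleave-with-ones trick but, as its hunk shows, splits the shift into two stages so it can round again before the final narrowing.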
@@ -1856,7 +1838,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
if (lr_flip) {
for (int j = 0; j < buf_size_w_div16; ++j) {
__m256i temp[16];
- flip_buf_av2(buf0 + 16 * j, temp, 16);
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
transpose_16bit_16x16_avx2(temp,
_buf1 + 16 * (buf_size_w_div16 - 1 - j));
}
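A pattern that runs through the whole file above: every former btf_16_subs_adds_avx2 call is folded into btf_16_adds_subs_avx2 with the argument order left unchanged. That is behaviour-preserving, because both removed macros (visible in the header diff below) snapshot their operands before writing, so they produce the same sum/difference pair. The pointer-based replacement itself is not visible in this excerpt; a sketch of its assumed shape, mirroring the removed macro:

/* Assumed shape of the new helper (requires AVX2 / immintrin.h); the real
 * definition lives outside this excerpt. */
static inline void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
  const __m256i a = *in0;                  /* snapshot both inputs first   */
  const __m256i b = *in1;
  *in0 = _mm256_adds_epi16(a, b);          /* saturating 16-bit sum        */
  *in1 = _mm256_subs_epi16(a, b);          /* saturating 16-bit difference */
}

Hence, for example, btf_16_subs_adds_avx2(x[11], x[10]) rewrites directly to btf_16_adds_subs_avx2(&x[11], &x[10]) with identical results.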
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
index c17f655c5..7b5b29cf8 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -19,37 +19,12 @@
#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
#ifdef __cplusplus
extern "C" {
#endif
-#define pair_set_w16_epi16(a, b) \
- _mm256_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
-
-#define btf_16_w16_avx2(w0, w1, in0, in1, out0, out1) \
- { \
- __m256i t0 = _mm256_unpacklo_epi16(in0, in1); \
- __m256i t1 = _mm256_unpackhi_epi16(in0, in1); \
- __m256i u0 = _mm256_madd_epi16(t0, w0); \
- __m256i u1 = _mm256_madd_epi16(t1, w0); \
- __m256i v0 = _mm256_madd_epi16(t0, w1); \
- __m256i v1 = _mm256_madd_epi16(t1, w1); \
- \
- __m256i a0 = _mm256_add_epi32(u0, __rounding); \
- __m256i a1 = _mm256_add_epi32(u1, __rounding); \
- __m256i b0 = _mm256_add_epi32(v0, __rounding); \
- __m256i b1 = _mm256_add_epi32(v1, __rounding); \
- \
- __m256i c0 = _mm256_srai_epi32(a0, cos_bit); \
- __m256i c1 = _mm256_srai_epi32(a1, cos_bit); \
- __m256i d0 = _mm256_srai_epi32(b0, cos_bit); \
- __m256i d1 = _mm256_srai_epi32(b1, cos_bit); \
- \
- out0 = _mm256_packs_epi32(c0, c1); \
- out1 = _mm256_packs_epi32(d0, d1); \
- }
-
// half input is zero
#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \
{ \
@@ -60,111 +35,6 @@ extern "C" {
out1 = _mm256_mulhrs_epi16(_in, _w1); \
}
-#define btf_16_adds_subs_avx2(in0, in1) \
- { \
- const __m256i _in0 = in0; \
- const __m256i _in1 = in1; \
- in0 = _mm256_adds_epi16(_in0, _in1); \
- in1 = _mm256_subs_epi16(_in0, _in1); \
- }
-
-#define btf_16_subs_adds_avx2(in0, in1) \
- { \
- const __m256i _in0 = in0; \
- const __m256i _in1 = in1; \
- in1 = _mm256_subs_epi16(_in0, _in1); \
- in0 = _mm256_adds_epi16(_in0, _in1); \
- }
-
-#define btf_16_adds_subs_out_avx2(out0, out1, in0, in1) \
- { \
- const __m256i _in0 = in0; \
- const __m256i _in1 = in1; \
- out0 = _mm256_adds_epi16(_in0, _in1); \
- out1 = _mm256_subs_epi16(_in0, _in1); \
- }
-
-static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
- const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
- const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
- return _mm256_permute4x64_epi64(b, 0xD8);
-}
-
-static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
- int stride, __m256i *out,
- int out_size) {
- for (int i = 0; i < out_size; ++i) {
- out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
- }
-}
-
-static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
- __m256i *const out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f
- // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f
- // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f
- // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f
- // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f
- // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f
- // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f
- // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f
- // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f
- // to:
- // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- // ...
- __m256i a[16];
- for (int i = 0; i < 16; i += 2) {
- a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
- a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
- }
- __m256i b[16];
- for (int i = 0; i < 16; i += 2) {
- b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
- b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
- }
- __m256i c[16];
- for (int i = 0; i < 16; i += 2) {
- c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
- c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
- }
- out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
- out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
- out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
- out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
-
- out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
- out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
- out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
- out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
-
- out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
- out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
- out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
- out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
-
- out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
- out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
- out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
- out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
-}
-
-static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
- if (bit < 0) {
- __m256i scale = _mm256_set1_epi16(1 << (bit + 15));
- for (int i = 0; i < size; ++i) {
- in[i] = _mm256_mulhrs_epi16(in[i], scale);
- }
- } else if (bit > 0) {
- for (int i = 0; i < size; ++i) {
- in[i] = _mm256_slli_epi16(in[i], bit);
- }
- }
-}
-
static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
int size) {
const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
@@ -173,12 +43,6 @@ static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
}
}
-static INLINE void flip_buf_av2(__m256i *in, __m256i *out, int size) {
- for (int i = 0; i < size; ++i) {
- out[size - i - 1] = in[i];
- }
-}
-
static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
__m128i pred = _mm_loadu_si128((__m128i const *)(output));
__m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
@@ -197,9 +61,6 @@ static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
}
}
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
- int8_t cos_bit);
-
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, TX_SIZE tx_size,
int eob);
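
Editor's note on the rounding helper removed above: round_shift_16bit_w16_avx2 performs a rounded right shift for negative `bit` by multiplying with the constant 1 << (bit + 15) via _mm256_mulhrs_epi16. As a reading aid, here is a minimal scalar sketch of the per-lane arithmetic; the function name is made up for illustration and is not part of the patch.

#include <stdint.h>

/* Scalar model of _mm256_mulhrs_epi16(x, 1 << (bit + 15)) for one 16-bit lane:
 * (x * 2^(bit+15) + 2^14) >> 15, which for bit < 0 reduces to a rounded
 * right shift by -bit; for bit > 0 the helper above uses a plain left shift. */
static int16_t round_shift_scalar_model(int16_t x, int bit) {
  if (bit < 0) {
    const int n = -bit;
    return (int16_t)((x + (1 << (n - 1))) >> n); /* rounded >> n */
  }
  return (int16_t)(x << bit);
}
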
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
index cccc62f03..90b9879cc 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include "config/aom_dsp_rtcd.h"
#include "av1/common/av1_txfm.h"
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
index faf7251fa..367e02096 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#ifndef AV1_TXFM_SSE4_H_
#define AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
index fd5e90a2e..1099144fe 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -24,8 +24,8 @@
void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
@@ -46,10 +46,10 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
@@ -180,8 +180,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
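
Editor's note: the hunks above replace loads from four separate shuffle tables (filt1_global_avx2 … filt4_global_avx2) with reads at 32-byte offsets into a single filt_global_avx2 table. A minimal sketch of that load pattern follows, assuming filt_global_avx2 is a 32-byte-aligned array of at least 4 * 32 bytes as the call sites imply; the helper name load_filt_masks is hypothetical.

#include <immintrin.h>
#include <stdint.h>

extern const uint8_t filt_global_avx2[]; /* assumed: 4 x 32-byte masks, 32-byte aligned */

static inline void load_filt_masks(__m256i filt[4]) {
  /* Same four aligned loads as in the diff, written as a loop over offsets. */
  for (int i = 0; i < 4; ++i)
    filt[i] = _mm256_load_si256((const __m256i *)(filt_global_avx2 + 32 * i));
}
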
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index fc0e65453..637f83cf7 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -21,8 +21,8 @@
void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
@@ -46,7 +46,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -112,7 +112,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -239,8 +239,8 @@ static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
(void)filter_params_x;
@@ -357,8 +357,8 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
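
Editor's note: the recurring change in this patch makes the InterpFilterParams arguments const pointers, and av1_get_interp_filter_subpel_kernel is now called with the pointer itself rather than a dereferenced copy. The declaration lives in av1/common/filter.h, which this section does not show; the prototype below is only a sketch inferred from the updated call sites.

#include "av1/common/filter.h" /* InterpFilterParams, SUBPEL_MASK (assumed) */

/* Sketch of the signature the const-correct call sites imply: */
const int16_t *av1_get_interp_filter_subpel_kernel(
    const InterpFilterParams *filter_params, int subpel_q4);

/* Call pattern after the patch (pointer passed directly):
 *   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
 *       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
 */
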
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
index 6fdfb0954..0e91ea947 100644
--- a/third_party/aom/av1/common/x86/convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -19,8 +19,8 @@
void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int i, j;
@@ -176,8 +176,8 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
int i, j;
@@ -187,10 +187,10 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
__m256i filt[4], coeffs[4];
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
index 18fe9ae5a..f66dee37d 100644
--- a/third_party/aom/av1/common/x86/convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -23,7 +23,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
const int subpel_q4,
__m128i *const coeffs /* [4] */) {
const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params, subpel_q4 & SUBPEL_MASK);
+ filter_params, subpel_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
@@ -78,8 +78,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -239,8 +239,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
index a34c618d0..8444ffa93 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -23,8 +23,8 @@
void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4,
const int subpel_y_q4,
ConvolveParams *conv_params, int bd) {
@@ -222,8 +222,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
void av1_highbd_convolve_2d_copy_sr_avx2(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
index bdf813fa0..15f8872c1 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -73,8 +73,8 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
void av1_highbd_convolve_2d_copy_sr_sse2(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
index 5d2fc465e..eb340523a 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -24,8 +24,8 @@
void av1_highbd_jnt_convolve_2d_copy_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -169,8 +169,8 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
void av1_highbd_jnt_convolve_2d_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
@@ -207,7 +207,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -274,7 +274,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index a9cf6a4d6..33183fdee 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -20,13 +20,11 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "av1/common/convolve.h"
-void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_convolve_2d_sr_ssse3(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
int im_h = h + filter_params_y->taps - 1;
int im_stride = 8;
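
Editor's note: the intermediate height im_h = h + filter_params_y->taps - 1 seen above is the standard sizing for a separable 2-D filter: the horizontal pass must produce taps - 1 extra rows so the vertical pass has full support for every output row, starting taps / 2 - 1 rows above the block. A minimal sketch of that bookkeeping, with hypothetical helper names:

/* Sketch: row accounting for a separable 2-D convolution.
 * For an output block of height h filtered vertically with `taps` taps,
 * the horizontal pass covers h + taps - 1 source rows, offset upward by
 * fo_vert = taps / 2 - 1 rows, matching the kernels in this patch. */
static inline int intermediate_height(int h, int taps) { return h + taps - 1; }
static inline int filter_offset_vert(int taps) { return taps / 2 - 1; }
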
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
index 89d0ecb1e..608bd88a4 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -25,8 +25,8 @@
void av1_highbd_jnt_convolve_2d_copy_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -224,13 +224,11 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
}
}
-void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_2d_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -459,13 +457,11 @@ void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_x_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -628,13 +624,11 @@ void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_y_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
index ccca6b07a..1a29985b5 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -19,8 +19,8 @@
void av1_highbd_jnt_convolve_y_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -259,8 +259,8 @@ void av1_highbd_jnt_convolve_y_sse4_1(
void av1_highbd_jnt_convolve_x_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
index ac1d2c9ca..d1ea26290 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -23,8 +23,8 @@
void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -53,10 +53,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
assert(bits >= 0);
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
@@ -126,8 +126,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -389,8 +389,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -422,10 +422,10 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
@@ -581,8 +581,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
index 4df7bd42e..87dc3242e 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -18,8 +18,8 @@
void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
@@ -152,8 +152,8 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
index e4d51ac8d..822772782 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -18,8 +18,8 @@
void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -56,7 +56,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -124,7 +124,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index a42c94028..c64150b9d 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include <smmintrin.h>
#include "config/aom_config.h"