author     trav90 <travawine@palemoon.org>  2018-10-17 05:59:08 -0500
committer  trav90 <travawine@palemoon.org>  2018-10-17 05:59:08 -0500
commit     df9477dfa60ebb5d31bc142e58ce46535c17abce (patch)
tree       c4fdd5d1b09d08c0514f208246260fc87372cb56 /third_party/aom/av1/common/x86
parent     0cc51bc106250988cc3b89cb5d743a5af52cd35a (diff)
Update aom to slightly newer commit ID
Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r--  third_party/aom/av1/common/x86/av1_convolve_ssse3.c          15
-rw-r--r--  third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c         28
-rw-r--r--  third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c         26
-rw-r--r--  third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c     8
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c       222
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c       384
-rw-r--r--  third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h    11
-rw-r--r--  third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c    344
-rw-r--r--  third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c        74
-rw-r--r--  third_party/aom/av1/common/x86/idct_intrin_sse2.c             8
-rw-r--r--  third_party/aom/av1/common/x86/warp_plane_sse2.c            369
-rw-r--r--  third_party/aom/av1/common/x86/warp_plane_ssse3.c           508
12 files changed, 1319 insertions, 678 deletions
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
index 91102bbaf..5e627ebcf 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
@@ -19,13 +19,13 @@
#define WIDTH_BOUND (16)
#define HEIGHT_BOUND (16)
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
DECLARE_ALIGNED(16, static int8_t,
sub_pel_filters_12sharp_signal_dir[15][2][16]);
DECLARE_ALIGNED(16, static int8_t,
sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
-#endif // CONFIG_DUAL_FILTER
+#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
#if USE_TEMPORALFILTER_12TAP
DECLARE_ALIGNED(16, static int8_t,
@@ -39,7 +39,7 @@ typedef int8_t (*SubpelFilterCoeffs)[16];
static INLINE SubpelFilterCoeffs
get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
if (p.interp_filter == MULTITAP_SHARP) {
return &sub_pel_filters_12sharp_signal_dir[index][0];
}
@@ -56,7 +56,7 @@ get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
static INLINE SubpelFilterCoeffs
get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
if (p.interp_filter == MULTITAP_SHARP) {
return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
}
@@ -143,6 +143,7 @@ static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
const __m128i k_256 = _mm_set1_epi16(1 << 8);
const __m128i zero = _mm_setzero_si128();
+ assert(tapsNum == 10 || tapsNum == 12);
if (10 == tapsNum) {
src -= 1;
}
@@ -470,6 +471,7 @@ static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
__m128i min_x2x3, max_x2x3;
__m128i temp;
+ assert(tapsNum == 10 || tapsNum == 12);
if (tapsNum == 10) {
src_ptr -= 1;
}
@@ -612,6 +614,7 @@ static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
__m128i x0, x1, x2, x3, x4, x5;
__m128i min_x2x3, max_x2x3, temp;
+ assert(tapsNum == 10 || tapsNum == 12);
if (tapsNum == 10) {
src_ptr -= 1;
}
@@ -982,7 +985,7 @@ typedef struct SimdFilter {
int8_t (*simd_vert_filter)[6][16];
} SimdFilter;
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
#define MULTITAP_FILTER_NUM 1
SimdFilter simd_filters[MULTITAP_FILTER_NUM] = {
{ MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0],
@@ -1010,7 +1013,7 @@ void av1_lowbd_convolve_init_ssse3(void) {
temporal_simd_filter.simd_vert_filter);
}
#endif
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
{
int i;
for (i = 0; i < MULTITAP_FILTER_NUM; ++i) {
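
The asserts added above document that these SSSE3 paths only handle 10- and 12-tap filters. A plain-C sketch of the core dot product one such horizontal tap pass computes, with the same precondition (a hypothetical reference helper, not the aom implementation):

#include <assert.h>
#include <stdint.h>

/* Scalar reference: one output sample of an N-tap horizontal filter.
 * The SSSE3 code above computes the same dot product for 4 or 8 pixels
 * at a time and now asserts the same precondition on the tap count. */
static int32_t horiz_sample_c(const uint8_t *src, const int16_t *coeffs,
                              int tapsNum) {
  int32_t sum = 0;
  int k;
  assert(tapsNum == 10 || tapsNum == 12);
  for (k = 0; k < tapsNum; ++k) sum += (int32_t)coeffs[k] * src[k];
  return sum;
}
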
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
index d04b667f1..97d2e74b1 100644
--- a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -57,7 +57,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 2
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
@@ -94,7 +94,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 3
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
@@ -131,7 +131,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 4
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
@@ -168,7 +168,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 5
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
buf1[1], bit);
btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
@@ -205,7 +205,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 6
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
@@ -242,7 +242,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 7
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf1[0] = buf0[0];
buf1[1] = buf0[1];
buf1[2] = buf0[2];
@@ -279,7 +279,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 8
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
@@ -383,7 +383,7 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
// stage 2
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
bit);
btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
@@ -399,7 +399,7 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
// stage 4
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
@@ -475,7 +475,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 2
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
bit);
btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
@@ -547,7 +547,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 4
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
@@ -619,7 +619,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 6
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
@@ -691,7 +691,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 8
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
@@ -763,7 +763,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
// stage 10
stage_idx++;
bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
+ cospi = cospi_arr(bit);
buf0[0] = buf1[0];
buf0[1] = buf1[1];
btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
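
Every hunk in this file replaces direct table indexing, cospi_arr[bit - cos_bit_min], with the accessor call cospi_arr(bit). A sketch of that pattern; the table shape, contents and minimum bit value below are illustrative, not the aom definitions:

#include <stdint.h>

#define COS_BIT_MIN_SKETCH 10                               /* stands in for cos_bit_min */
static const int32_t cospi_table_sketch[4][64] = { { 0 } }; /* placeholder contents */

/* Hypothetical accessor mirroring the new call sites: the
 * "bit - cos_bit_min" offset now lives in one place instead of at
 * every stage of the transform. */
static inline const int32_t *cospi_arr_sketch(int bit) {
  return cospi_table_sketch[bit - COS_BIT_MIN_SKETCH];
}

/* Usage, as in the hunks above:
 *   const int32_t *cospi = cospi_arr_sketch(bit);
 *   btf_32_sse4_1_type0(cospi[32], cospi[32], ...);  */
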
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
index 78c261374..1d7c55349 100644
--- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -37,16 +37,20 @@ static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
}
static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
- const int stride, const TXFM_2D_CFG *cfg,
+ const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
int32_t *txfm_buf) {
- const int txfm_size = cfg->txfm_size;
- const int8_t *shift = cfg->shift;
- const int8_t *stage_range_col = cfg->stage_range_col;
- const int8_t *stage_range_row = cfg->stage_range_row;
- const int8_t *cos_bit_col = cfg->cos_bit_col;
- const int8_t *cos_bit_row = cfg->cos_bit_row;
- const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
- const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+ // TODO(sarahparker) must correct for rectangular transforms in follow up
+ const int txfm_size = cfg->row_cfg->txfm_size;
+ const int8_t *shift = cfg->row_cfg->shift;
+ const int8_t *stage_range_col = cfg->col_cfg->stage_range;
+ const int8_t *stage_range_row = cfg->row_cfg->stage_range;
+ const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
+ const int8_t *cos_bit_row = cfg->row_cfg->cos_bit;
+ const TxfmFuncSSE2 txfm_func_col =
+ fwd_txfm_type_to_func(cfg->col_cfg->txfm_type);
+ const TxfmFuncSSE2 txfm_func_row =
+ fwd_txfm_type_to_func(cfg->row_cfg->txfm_type);
__m128i *buf_128 = (__m128i *)txfm_buf;
__m128i *out_128 = (__m128i *)output;
@@ -69,7 +73,7 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
(void)bd;
- fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+ fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
}
void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
@@ -77,5 +81,5 @@ void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type);
(void)bd;
- fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+ fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
}
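
fwd_txfm2d_sse4_1 now receives the whole TXFM_2D_FLIP_CFG and reads the row and column parameters through its row_cfg and col_cfg members, and callers pass the config by address (&cfg) instead of cfg.cfg. A sketch of the shape implied by the fields dereferenced above; anything beyond those members is an assumption:

#include <stdint.h>

/* Hypothetical mirror of the split 1-D config; only the members read in
 * fwd_txfm2d_sse4_1 above are listed. */
typedef struct {
  int txfm_size;
  const int8_t *shift;
  const int8_t *stage_range;
  const int8_t *cos_bit;
  int txfm_type;
} txfm_1d_cfg_sketch;

/* The 2-D flip config pairs one row config with one column config. */
typedef struct {
  const txfm_1d_cfg_sketch *row_cfg; /* drives the row pass */
  const txfm_1d_cfg_sketch *col_cfg; /* drives the column pass */
} txfm_2d_flip_cfg_sketch;
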
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
index cf6249bdc..68461bc36 100644
--- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -15,7 +15,7 @@
#include "./av1_rtcd.h"
#include "av1/common/filter.h"
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
#endif
@@ -31,7 +31,7 @@ typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
static INLINE HbdSubpelFilterCoeffs
hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
if (p.interp_filter == MULTITAP_SHARP) {
return &subpel_filters_sharp[index][0];
}
@@ -76,7 +76,7 @@ void av1_highbd_convolve_init_sse4_1(void) {
init_simd_filter(filter_ptr, taps, subpel_temporalfilter);
}
#endif
-#if CONFIG_DUAL_FILTER
+#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
{
InterpFilterParams filter_params =
av1_get_interp_filter_params(MULTITAP_SHARP);
@@ -246,6 +246,7 @@ static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
int tapsNum, uint32_t *buf) {
__m128i u[8], v[6];
+ assert(tapsNum == 10 || tapsNum == 12);
if (tapsNum == 10) {
src -= 1;
}
@@ -412,6 +413,7 @@ static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
int r = 0;
// TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
+ assert(taps == 10 || taps == 12);
if (10 == taps) {
i += 1;
s[0] = zero;
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
index d10f1ccc2..dd2a681bc 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -13,7 +13,7 @@
#include "./av1_rtcd.h"
#include "./aom_config.h"
-#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
// Note:
// Total 32x4 registers to represent 32x32 block coefficients.
@@ -154,20 +154,21 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
}
}
-static INLINE __m256i half_btf_avx2(__m256i w0, __m256i n0, __m256i w1,
- __m256i n1, __m256i rounding, int bit) {
+static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
__m256i x, y;
- x = _mm256_mullo_epi32(w0, n0);
- y = _mm256_mullo_epi32(w1, n1);
+ x = _mm256_mullo_epi32(*w0, *n0);
+ y = _mm256_mullo_epi32(*w1, *n1);
x = _mm256_add_epi32(x, y);
- x = _mm256_add_epi32(x, rounding);
+ x = _mm256_add_epi32(x, *rounding);
x = _mm256_srai_epi32(x, bit);
return x;
}
static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
- const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const int32_t *cospi = cospi_arr(bit);
const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
@@ -275,22 +276,38 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[13] = bf1[13];
bf0[14] = bf1[14];
bf0[15] = bf1[15];
- bf0[16] = half_btf_avx2(cospi62, bf1[16], cospim2, bf1[31], rounding, bit);
- bf0[17] = half_btf_avx2(cospi30, bf1[17], cospim34, bf1[30], rounding, bit);
- bf0[18] = half_btf_avx2(cospi46, bf1[18], cospim18, bf1[29], rounding, bit);
- bf0[19] = half_btf_avx2(cospi14, bf1[19], cospim50, bf1[28], rounding, bit);
- bf0[20] = half_btf_avx2(cospi54, bf1[20], cospim10, bf1[27], rounding, bit);
- bf0[21] = half_btf_avx2(cospi22, bf1[21], cospim42, bf1[26], rounding, bit);
- bf0[22] = half_btf_avx2(cospi38, bf1[22], cospim26, bf1[25], rounding, bit);
- bf0[23] = half_btf_avx2(cospi6, bf1[23], cospim58, bf1[24], rounding, bit);
- bf0[24] = half_btf_avx2(cospi58, bf1[23], cospi6, bf1[24], rounding, bit);
- bf0[25] = half_btf_avx2(cospi26, bf1[22], cospi38, bf1[25], rounding, bit);
- bf0[26] = half_btf_avx2(cospi42, bf1[21], cospi22, bf1[26], rounding, bit);
- bf0[27] = half_btf_avx2(cospi10, bf1[20], cospi54, bf1[27], rounding, bit);
- bf0[28] = half_btf_avx2(cospi50, bf1[19], cospi14, bf1[28], rounding, bit);
- bf0[29] = half_btf_avx2(cospi18, bf1[18], cospi46, bf1[29], rounding, bit);
- bf0[30] = half_btf_avx2(cospi34, bf1[17], cospi30, bf1[30], rounding, bit);
- bf0[31] = half_btf_avx2(cospi2, bf1[16], cospi62, bf1[31], rounding, bit);
+ bf0[16] =
+ half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
// stage 3
bf1[0] = bf0[0];
@@ -301,14 +318,22 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf1[5] = bf0[5];
bf1[6] = bf0[6];
bf1[7] = bf0[7];
- bf1[8] = half_btf_avx2(cospi60, bf0[8], cospim4, bf0[15], rounding, bit);
- bf1[9] = half_btf_avx2(cospi28, bf0[9], cospim36, bf0[14], rounding, bit);
- bf1[10] = half_btf_avx2(cospi44, bf0[10], cospim20, bf0[13], rounding, bit);
- bf1[11] = half_btf_avx2(cospi12, bf0[11], cospim52, bf0[12], rounding, bit);
- bf1[12] = half_btf_avx2(cospi52, bf0[11], cospi12, bf0[12], rounding, bit);
- bf1[13] = half_btf_avx2(cospi20, bf0[10], cospi44, bf0[13], rounding, bit);
- bf1[14] = half_btf_avx2(cospi36, bf0[9], cospi28, bf0[14], rounding, bit);
- bf1[15] = half_btf_avx2(cospi4, bf0[8], cospi60, bf0[15], rounding, bit);
+ bf1[8] =
+ half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
@@ -331,10 +356,13 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[1] = bf1[1];
bf0[2] = bf1[2];
bf0[3] = bf1[3];
- bf0[4] = half_btf_avx2(cospi56, bf1[4], cospim8, bf1[7], rounding, bit);
- bf0[5] = half_btf_avx2(cospi24, bf1[5], cospim40, bf1[6], rounding, bit);
- bf0[6] = half_btf_avx2(cospi40, bf1[5], cospi24, bf1[6], rounding, bit);
- bf0[7] = half_btf_avx2(cospi8, bf1[4], cospi56, bf1[7], rounding, bit);
+ bf0[4] =
+ half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
@@ -344,40 +372,54 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
bf0[16] = bf1[16];
- bf0[17] = half_btf_avx2(cospim8, bf1[17], cospi56, bf1[30], rounding, bit);
- bf0[18] = half_btf_avx2(cospim56, bf1[18], cospim8, bf1[29], rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
bf0[19] = bf1[19];
bf0[20] = bf1[20];
- bf0[21] = half_btf_avx2(cospim40, bf1[21], cospi24, bf1[26], rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
bf0[22] =
- half_btf_avx2(cospim24, bf1[22], cospim40, bf1[25], rounding, bit);
+ half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
bf0[23] = bf1[23];
bf0[24] = bf1[24];
- bf0[25] = half_btf_avx2(cospim40, bf1[22], cospi24, bf1[25], rounding, bit);
- bf0[26] = half_btf_avx2(cospi24, bf1[21], cospi40, bf1[26], rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
bf0[27] = bf1[27];
bf0[28] = bf1[28];
- bf0[29] = half_btf_avx2(cospim8, bf1[18], cospi56, bf1[29], rounding, bit);
- bf0[30] = half_btf_avx2(cospi56, bf1[17], cospi8, bf1[30], rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
bf0[31] = bf1[31];
// stage 5
- bf1[0] = half_btf_avx2(cospi32, bf0[0], cospi32, bf0[1], rounding, bit);
- bf1[1] = half_btf_avx2(cospi32, bf0[0], cospim32, bf0[1], rounding, bit);
- bf1[2] = half_btf_avx2(cospi48, bf0[2], cospim16, bf0[3], rounding, bit);
- bf1[3] = half_btf_avx2(cospi16, bf0[2], cospi48, bf0[3], rounding, bit);
+ bf1[0] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
bf1[8] = bf0[8];
- bf1[9] = half_btf_avx2(cospim16, bf0[9], cospi48, bf0[14], rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
bf1[10] =
- half_btf_avx2(cospim48, bf0[10], cospim16, bf0[13], rounding, bit);
+ half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
bf1[11] = bf0[11];
bf1[12] = bf0[12];
- bf1[13] = half_btf_avx2(cospim16, bf0[10], cospi48, bf0[13], rounding, bit);
- bf1[14] = half_btf_avx2(cospi48, bf0[9], cospi16, bf0[14], rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
bf1[15] = bf0[15];
bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
@@ -402,8 +444,10 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
bf0[4] = bf1[4];
- bf0[5] = half_btf_avx2(cospim32, bf1[5], cospi32, bf1[6], rounding, bit);
- bf0[6] = half_btf_avx2(cospi32, bf1[5], cospi32, bf1[6], rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
bf0[7] = bf1[7];
bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
@@ -415,20 +459,26 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
bf0[16] = bf1[16];
bf0[17] = bf1[17];
- bf0[18] = half_btf_avx2(cospim16, bf1[18], cospi48, bf1[29], rounding, bit);
- bf0[19] = half_btf_avx2(cospim16, bf1[19], cospi48, bf1[28], rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
bf0[20] =
- half_btf_avx2(cospim48, bf1[20], cospim16, bf1[27], rounding, bit);
+ half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
bf0[21] =
- half_btf_avx2(cospim48, bf1[21], cospim16, bf1[26], rounding, bit);
+ half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
bf0[22] = bf1[22];
bf0[23] = bf1[23];
bf0[24] = bf1[24];
bf0[25] = bf1[25];
- bf0[26] = half_btf_avx2(cospim16, bf1[21], cospi48, bf1[26], rounding, bit);
- bf0[27] = half_btf_avx2(cospim16, bf1[20], cospi48, bf1[27], rounding, bit);
- bf0[28] = half_btf_avx2(cospi48, bf1[19], cospi16, bf1[28], rounding, bit);
- bf0[29] = half_btf_avx2(cospi48, bf1[18], cospi16, bf1[29], rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
bf0[30] = bf1[30];
bf0[31] = bf1[31];
@@ -443,10 +493,14 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
bf1[8] = bf0[8];
bf1[9] = bf0[9];
- bf1[10] = half_btf_avx2(cospim32, bf0[10], cospi32, bf0[13], rounding, bit);
- bf1[11] = half_btf_avx2(cospim32, bf0[11], cospi32, bf0[12], rounding, bit);
- bf1[12] = half_btf_avx2(cospi32, bf0[11], cospi32, bf0[12], rounding, bit);
- bf1[13] = half_btf_avx2(cospi32, bf0[10], cospi32, bf0[13], rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
bf1[14] = bf0[14];
bf1[15] = bf0[15];
bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
@@ -487,14 +541,22 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[17] = bf1[17];
bf0[18] = bf1[18];
bf0[19] = bf1[19];
- bf0[20] = half_btf_avx2(cospim32, bf1[20], cospi32, bf1[27], rounding, bit);
- bf0[21] = half_btf_avx2(cospim32, bf1[21], cospi32, bf1[26], rounding, bit);
- bf0[22] = half_btf_avx2(cospim32, bf1[22], cospi32, bf1[25], rounding, bit);
- bf0[23] = half_btf_avx2(cospim32, bf1[23], cospi32, bf1[24], rounding, bit);
- bf0[24] = half_btf_avx2(cospi32, bf1[23], cospi32, bf1[24], rounding, bit);
- bf0[25] = half_btf_avx2(cospi32, bf1[22], cospi32, bf1[25], rounding, bit);
- bf0[26] = half_btf_avx2(cospi32, bf1[21], cospi32, bf1[26], rounding, bit);
- bf0[27] = half_btf_avx2(cospi32, bf1[20], cospi32, bf1[27], rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
bf0[28] = bf1[28];
bf0[29] = bf1[29];
bf0[30] = bf1[30];
@@ -539,18 +601,20 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
int stride, int tx_type, int bd) {
__m256i in[128], out[128];
- const TXFM_2D_CFG *cfg = NULL;
+ const TXFM_1D_CFG *row_cfg = NULL;
+ const TXFM_1D_CFG *col_cfg = NULL;
switch (tx_type) {
case DCT_DCT:
- cfg = &inv_txfm_2d_cfg_dct_dct_32;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_32;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_32;
load_buffer_32x32(coeff, in);
transpose_32x32(in, out);
- idct32_avx2(out, in, cfg->cos_bit_row[2]);
- round_shift_32x32(in, -cfg->shift[0]);
+ idct32_avx2(out, in, row_cfg->cos_bit[2]);
+ round_shift_32x32(in, -row_cfg->shift[0]);
transpose_32x32(in, out);
- idct32_avx2(out, in, cfg->cos_bit_col[2]);
- write_buffer_32x32(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ idct32_avx2(out, in, col_cfg->cos_bit[2]);
+ write_buffer_32x32(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
default: assert(0);
}
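
half_btf_avx2 now takes its operands by pointer, but each 32-bit lane still evaluates the same rounded butterfly term; the scalar sketch below restates what the intrinsics in the hunk above compute (the AVX2 code stays in 32-bit lanes, the 64-bit intermediate here only keeps the plain-C version free of overflow):

#include <stdint.h>

/* Per-lane equivalent of half_btf_avx2(&w0, &n0, &w1, &n1, &rounding, bit):
 * two weighted inputs are multiplied and summed, the rounding constant
 * 1 << (bit - 1) is added, and the result is arithmetic-shifted right. */
static int32_t half_btf_scalar(int32_t w0, int32_t n0, int32_t w1, int32_t n1,
                               int bit) {
  const int64_t rounding = (int64_t)1 << (bit - 1);
  return (int32_t)(((int64_t)w0 * n0 + (int64_t)w1 * n1 + rounding) >> bit);
}
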
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
index 24b2760b9..a93699f0b 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -13,7 +13,7 @@
#include "./av1_rtcd.h"
#include "./aom_config.h"
-#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
@@ -24,7 +24,7 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
}
static void idct4x4_sse4_1(__m128i *in, int bit) {
- const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
@@ -72,7 +72,7 @@ static void idct4x4_sse4_1(__m128i *in, int bit) {
}
static void iadst4x4_sse4_1(__m128i *in, int bit) {
- const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
@@ -232,72 +232,82 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, int tx_type, int bd) {
__m128i in[4];
- const TXFM_2D_CFG *cfg = NULL;
+ const TXFM_1D_CFG *row_cfg = NULL;
+ const TXFM_1D_CFG *col_cfg = NULL;
switch (tx_type) {
case DCT_DCT:
- cfg = &inv_txfm_2d_cfg_dct_dct_4;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_4;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
- idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case ADST_DCT:
- cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_4;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
- iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case DCT_ADST:
- cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_4;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case ADST_ADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_4;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
- cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_4;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
- iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
break;
case DCT_FLIPADST:
- cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_4;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
break;
case FLIPADST_FLIPADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_4;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
break;
case ADST_FLIPADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_4;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
break;
case FLIPADST_ADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_4;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
- write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+ iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
break;
#endif // CONFIG_EXT_TX
default: assert(0);
@@ -325,7 +335,7 @@ static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
}
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
- const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const int32_t *cospi = cospi_arr(bit);
const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
@@ -439,7 +449,7 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
}
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
- const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
@@ -698,90 +708,100 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, int tx_type, int bd) {
__m128i in[16], out[16];
- const TXFM_2D_CFG *cfg = NULL;
+ const TXFM_1D_CFG *row_cfg = NULL;
+ const TXFM_1D_CFG *col_cfg = NULL;
switch (tx_type) {
case DCT_DCT:
- cfg = &inv_txfm_2d_cfg_dct_dct_8;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_8;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case DCT_ADST:
- cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_8;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case ADST_DCT:
- cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_8;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case ADST_ADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_8;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
- cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_8;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
break;
case DCT_FLIPADST:
- cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_8;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
break;
case ADST_FLIPADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_8;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
break;
case FLIPADST_FLIPADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_8;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
break;
case FLIPADST_ADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_8;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
break;
#endif // CONFIG_EXT_TX
default: assert(0);
@@ -849,7 +869,7 @@ static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
}
static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
- const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const int32_t *cospi = cospi_arr(bit);
const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
@@ -907,24 +927,24 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[6] = u[6];
v[7] = u[7];
- v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit);
- v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit);
- v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit);
- v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit);
- v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit);
- v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit);
- v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit);
- v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
// stage 3
u[0] = v[0];
u[1] = v[1];
u[2] = v[2];
u[3] = v[3];
- u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit);
- u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit);
- u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit);
- u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit);
+ u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+ u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
u[8] = _mm_add_epi32(v[8], v[9]);
u[9] = _mm_sub_epi32(v[8], v[9]);
u[10] = _mm_sub_epi32(v[11], v[10]);
@@ -945,19 +965,19 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[1] = _mm_add_epi32(v[1], rnding);
v[1] = _mm_srai_epi32(v[1], bit);
- v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit);
- v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
v[4] = _mm_add_epi32(u[4], u[5]);
v[5] = _mm_sub_epi32(u[4], u[5]);
v[6] = _mm_sub_epi32(u[7], u[6]);
v[7] = _mm_add_epi32(u[6], u[7]);
v[8] = u[8];
- v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit);
- v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit);
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
v[11] = u[11];
v[12] = u[12];
- v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit);
- v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit);
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
v[15] = u[15];
// stage 5
@@ -1043,7 +1063,7 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
}
static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
- const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
@@ -1183,18 +1203,18 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[1] = u[1];
v[2] = u[2];
v[3] = u[3];
- v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit);
- v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit);
- v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit);
- v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
v[8] = u[8];
v[9] = u[9];
v[10] = u[10];
v[11] = u[11];
- v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit);
- v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit);
- v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit);
- v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
// stage 5
u[0] = _mm_add_epi32(v[0], v[4]);
@@ -1223,14 +1243,14 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[5] = u[5];
v[6] = u[6];
v[7] = u[7];
- v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit);
- v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit);
- v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit);
- v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit);
- v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit);
- v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit);
- v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit);
- v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
// stage 7
u[0] = _mm_add_epi32(v[0], v[8]);
@@ -1251,22 +1271,22 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
u[15] = _mm_sub_epi32(v[7], v[15]);
// stage 8
- v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit);
- v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit);
- v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit);
- v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit);
- v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit);
- v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit);
- v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit);
- v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit);
- v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit);
- v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit);
- v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit);
- v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit);
- v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit);
- v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit);
- v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit);
- v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit);
+ v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
// stage 9
out[0 * 4 + col] = v[1];
@@ -1298,99 +1318,109 @@ static void round_shift_16x16(__m128i *in, int shift) {
void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, int tx_type, int bd) {
__m128i in[64], out[64];
- const TXFM_2D_CFG *cfg = NULL;
+ const TXFM_1D_CFG *row_cfg = NULL;
+ const TXFM_1D_CFG *col_cfg = NULL;
switch (tx_type) {
case DCT_DCT:
- cfg = &inv_txfm_2d_cfg_dct_dct_16;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_16;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case DCT_ADST:
- cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_16;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case ADST_DCT:
- cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_16;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
case ADST_ADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_16;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
- cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ row_cfg = &inv_txfm_1d_row_cfg_dct_16;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
break;
case DCT_FLIPADST:
- cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_16;
+ col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
break;
case ADST_FLIPADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_16;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
break;
case FLIPADST_FLIPADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_16;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
break;
case FLIPADST_ADST:
- cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ row_cfg = &inv_txfm_1d_row_cfg_adst_16;
+ col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
- round_shift_16x16(in, -cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
+ round_shift_16x16(in, -row_cfg->shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
- write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
break;
#endif
default: assert(0);
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
index bc96defe3..fb246674a 100644
--- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -77,14 +77,15 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
// Note:
// rounding = 1 << (bit - 1)
-static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0, __m128i w1,
- __m128i n1, __m128i rounding, int bit) {
+static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
+ const __m128i *w1, const __m128i *n1,
+ const __m128i *rounding, int bit) {
__m128i x, y;
- x = _mm_mullo_epi32(w0, n0);
- y = _mm_mullo_epi32(w1, n1);
+ x = _mm_mullo_epi32(*w0, *n0);
+ y = _mm_mullo_epi32(*w1, *n1);
x = _mm_add_epi32(x, y);
- x = _mm_add_epi32(x, rounding);
+ x = _mm_add_epi32(x, *rounding);
x = _mm_srai_epi32(x, bit);
return x;
}
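
The half-butterfly that half_btf_sse4_1 vectorizes is the scalar weighted sum below; the change above only switches the __m128i operands to const pointers so 16-byte arguments are not copied by value. A minimal scalar sketch of the same arithmetic (illustrative only; the rounding term is 1 << (bit - 1), as the comment in the header notes):

#include <stdint.h>

// Scalar reference for one half-butterfly lane: (w0*n0 + w1*n1 + round) >> bit.
// 64-bit accumulation is used here only to keep the sketch overflow-safe.
static int32_t half_btf_scalar(int32_t w0, int32_t n0, int32_t w1, int32_t n1,
                               int bit) {
  const int64_t rounding = (int64_t)1 << (bit - 1);
  return (int32_t)(((int64_t)w0 * n0 + (int64_t)w1 * n1 + rounding) >> bit);
}
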
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
index c25db88b7..37e2f61e7 100644
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -14,16 +14,13 @@
#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
-static const __m128i *const filter = (const __m128i *const)warped_filter;
-
-/* SSE2 version of the rotzoom/affine warp filter */
-void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
- int height, int stride, uint16_t *pred,
- int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x,
- int subsampling_y, int bd, int ref_frm,
- int16_t alpha, int16_t beta, int16_t gamma,
- int16_t delta) {
+void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ int comp_avg, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta) {
#if HORSHEAR_REDUCE_PREC_BITS >= 5
__m128i tmp[15];
#else
@@ -47,23 +44,21 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
for (j = 0; j < p_width; j += 8) {
// (x, y) coordinates of the center of this block in the destination
// image
- int32_t dst_x = p_col + j + 4;
- int32_t dst_y = p_row + i + 4;
+ const int32_t dst_x = p_col + j + 4;
+ const int32_t dst_y = p_row + i + 4;
int32_t x4, y4, ix4, sx4, iy4, sy4;
if (subsampling_x)
- x4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
+ x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+ 4;
else
x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
if (subsampling_y)
- y4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
+ y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+ 4;
else
y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
@@ -72,71 +67,103 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
iy4 = y4 >> WARPEDMODEL_PREC_BITS;
sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
// Horizontal filter
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
-
- // If the block is aligned such that, after clamping, every sample
- // would be taken from the leftmost/rightmost column, then we can
- // skip the expensive horizontal filter.
- if (ix4 <= -7) {
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+ 1)) +
ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- } else if (ix4 >= width + 6) {
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+ 1)) +
ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- } else {
- int sx = sx4 + alpha * (-4) + beta * k +
- // Include rounding and offset here
- (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ }
+ } else {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
// Load source pixels
- __m128i src =
+ const __m128i src =
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- __m128i src2 =
+ const __m128i src2 =
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
// Filter even-index pixels
- __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS];
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
// coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
- __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
// coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
- __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
// coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
- __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
// coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
- __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
// coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
- __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
// coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
- __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
// coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
- __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
- __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
- __m128i round_const =
- _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+ ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
// Calculate filtered results
- __m128i res_0 = _mm_madd_epi16(src, coeff_0);
- __m128i res_2 =
+ const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
+ const __m128i res_2 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
- __m128i res_4 =
+ const __m128i res_4 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
- __m128i res_6 =
+ const __m128i res_6 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
@@ -145,28 +172,36 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
- __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS];
-
- __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- __m128i res_1 =
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
- __m128i res_3 =
+ const __m128i res_3 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
- __m128i res_5 =
+ const __m128i res_5 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
- __m128i res_7 =
+ const __m128i res_7 =
_mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
__m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
@@ -183,101 +218,118 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + gamma * (-4) + delta * k +
- (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ int sy = sy4 + delta * (k + 4);
// Load from tmp and rearrange pairs of consecutive rows into the
// column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- __m128i *src = tmp + (k + 4);
- __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
// Filter even-index pixels
- __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS];
-
- __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
// Filter odd-index pixels
- __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS];
- __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS];
-
- __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
// Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
      // Round and pack into 16 bits
- __m128i round_const =
- _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+ ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
- __m128i res_lo_round = _mm_srai_epi32(
+ const __m128i res_lo_round = _mm_srai_epi32(
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
- __m128i res_hi_round = _mm_srai_epi32(
+ const __m128i res_hi_round = _mm_srai_epi32(
_mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
// Clamp res_16bit to the range [0, 2^bd - 1]
- __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
- __m128i zero = _mm_setzero_si128();
+ const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i zero = _mm_setzero_si128();
res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
// Store, blending with 'pred' if needed
- __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
// Note: If we're outputting a 4x4 block, we need to be very careful
// to only output 4 pixels at this point, to avoid encode/decode
// mismatches when encoding with multiple threads.
if (p_width == 4) {
- if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
+ if (comp_avg)
+ res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
_mm_storel_epi64(p, res_16bit);
} else {
- if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
+ if (comp_avg)
+ res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
_mm_storeu_si128(p, res_16bit);
}
}
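
Both warp kernels in this patch now derive the per-block filter phase the same way: the subsampled (chroma) coordinate is evaluated at 4x precision and divided back down, and the rounding/offset constants are folded into sx4/sy4 once per 8x8 block, so each row only adds beta * (k + 4) (and each column gamma/delta analogously). The sketch below is a hypothetical scalar helper, not part of the library, collecting the x-direction half of that setup; it assumes the constants defined in av1/common/warped_motion.h.

#include "av1/common/warped_motion.h"  // WARPEDMODEL_PREC_BITS, WARPEDDIFF_PREC_BITS, ...

// Illustrative scalar mirror of the per-block setup in the SIMD kernels above.
static void warp_block_setup_x(const int32_t *mat, int32_t dst_x, int32_t dst_y,
                               int subsampling_x, int16_t alpha, int16_t beta,
                               int32_t *ix4, int32_t *sx4) {
  int32_t x4;
  if (subsampling_x)
    // Same expression the kernels use for subsampled planes.
    x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
          (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
         4;
  else
    x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
  *ix4 = x4 >> WARPEDMODEL_PREC_BITS;
  *sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  // Fold the rounding and filter-offset constants in once per block, then
  // drop the low bits so the per-sample offset reduces to beta * (k + 4).
  *sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
          (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  *sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
}
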
diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
index efc8d1e24..c69614e42 100644
--- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -14,67 +14,9 @@
#include "./aom_config.h"
#include "./av1_rtcd.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
-#if CONFIG_HIGHBITDEPTH
- *in = _mm256_setr_epi16(
- (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
- (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
- (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
- (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
- (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
- (int16_t)coeff[15]);
-#else
- *in = _mm256_loadu_si256((const __m256i *)coeff);
-#endif
-}
-
-static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
- int i = 0;
- while (i < 16) {
- load_coeff(coeff + (i << 4), &in[i]);
- i += 1;
- }
-}
-
-static void recon_and_store(const __m256i *res, uint8_t *output) {
- const __m128i zero = _mm_setzero_si128();
- __m128i x = _mm_loadu_si128((__m128i const *)output);
- __m128i p0 = _mm_unpacklo_epi8(x, zero);
- __m128i p1 = _mm_unpackhi_epi8(x, zero);
-
- p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
- p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
- x = _mm_packus_epi16(p0, p1);
- _mm_storeu_si128((__m128i *)output, x);
-}
-
-#define IDCT_ROUNDING_POS (6)
-
-static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
- const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
- int i = 0;
-
- while (i < 16) {
- in[i] = _mm256_add_epi16(in[i], rounding);
- in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
- recon_and_store(&in[i], output + i * stride);
- i += 1;
- }
-}
-
-static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
- const __m256i *c0, const __m256i *c1,
- __m256i *b0, __m256i *b1) {
- __m256i x0, x1;
- x0 = _mm256_unpacklo_epi16(*a0, *a1);
- x1 = _mm256_unpackhi_epi16(*a0, *a1);
- *b0 = butter_fly(x0, x1, *c0);
- *b1 = butter_fly(x0, x1, *c1);
-}
+#include "aom_dsp/x86/inv_txfm_common_avx2.h"
-static void idct16_avx2(__m256i *in) {
+void av1_idct16_avx2(__m256i *in) {
const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
@@ -216,8 +158,8 @@ static void idct16_avx2(__m256i *in) {
}
static void idct16(__m256i *in) {
- mm256_transpose_16x16(in);
- idct16_avx2(in);
+ mm256_transpose_16x16(in, in);
+ av1_idct16_avx2(in);
}
static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
@@ -398,7 +340,7 @@ static void iadst16_avx2(__m256i *in) {
}
static void iadst16(__m256i *in) {
- mm256_transpose_16x16(in);
+ mm256_transpose_16x16(in, in);
iadst16_avx2(in);
}
@@ -416,8 +358,8 @@ static void flip_col(uint8_t **dest, int *stride, int rows) {
}
static void iidtx16(__m256i *in) {
- mm256_transpose_16x16(in);
- txfm_scaling16_avx2(Sqrt2, in);
+ mm256_transpose_16x16(in, in);
+ txfm_scaling16_avx2((int16_t)Sqrt2, in);
}
#endif
@@ -503,5 +445,5 @@ void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
#endif // CONFIG_EXT_TX
default: assert(0); break;
}
- write_buffer_16x16(in, stride, dest);
+ store_buffer_16xN(in, stride, dest, 16);
}
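
The deleted helpers show the reconstruction step the shared header now provides: each 16-bit residual lane is rounded by 1 << 5, shifted right by IDCT_ROUNDING_POS (6), added to the decoded pixel, and saturated to 8 bits by _mm_packus_epi16. A hedged scalar equivalent of that removed recon_and_store()/write_buffer_16x16() pair, assuming 8-bit pixels:

#include <stdint.h>

// Scalar sketch of one reconstructed row; illustrative only.
static void recon_row_scalar(const int16_t *residual, uint8_t *output, int n) {
  for (int i = 0; i < n; ++i) {
    const int rounded = (residual[i] + (1 << 5)) >> 6;  // IDCT_ROUNDING_POS = 6
    int pixel = output[i] + rounded;
    if (pixel < 0) pixel = 0;      // _mm_packus_epi16 saturates to [0, 255]
    if (pixel > 255) pixel = 255;
    output[i] = (uint8_t)pixel;
  }
}
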
diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
index 522e8988c..d6a598746 100644
--- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c
+++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
@@ -17,14 +17,14 @@
#include "av1/common/enums.h"
#if CONFIG_EXT_TX
-static INLINE void fliplr_4x4(__m128i in[2]) {
+static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) {
in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
}
-static INLINE void fliplr_8x8(__m128i in[8]) {
+static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) {
in[0] = mm_reverse_epi16(in[0]);
in[1] = mm_reverse_epi16(in[1]);
in[2] = mm_reverse_epi16(in[2]);
@@ -36,7 +36,7 @@ static INLINE void fliplr_8x8(__m128i in[8]) {
in[7] = mm_reverse_epi16(in[7]);
}
-static INLINE void fliplr_16x8(__m128i in[16]) {
+static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) {
fliplr_8x8(&in[0]);
fliplr_8x8(&in[8]);
}
@@ -356,7 +356,7 @@ static void iidtx8_sse2(__m128i *in) {
}
static INLINE void iidtx4_sse2(__m128i *in) {
- const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+ const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
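
The _mm_mullo_epi16/_mm_mulhi_epi16 pair above yields the low and high 16-bit halves of each signed 16x16 product with the Sqrt2 scale factor; interleaving the two halves rebuilds the exact 32-bit products before rounding. A small scalar illustration of that recombination (illustrative only, one lane at a time):

#include <stdint.h>

// Rebuild the full signed 16x16 -> 32-bit product from its mullo/mulhi halves.
static int32_t combine_mul_halves(int16_t a, int16_t b) {
  const uint16_t lo = (uint16_t)(a * b);                  // _mm_mullo_epi16
  const int16_t hi = (int16_t)(((int32_t)a * b) >> 16);   // _mm_mulhi_epi16
  return (int32_t)((uint32_t)lo | ((uint32_t)(uint16_t)hi << 16));
}
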
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c
index 925e4650d..cdc4e8d0f 100644
--- a/third_party/aom/av1/common/x86/warp_plane_sse2.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -14,17 +14,15 @@
#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
-static const __m128i *const filter = (const __m128i *const)warped_filter;
-
-/* SSE2 version of the rotzoom/affine warp filter */
-void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
- int stride, uint8_t *pred, int p_col, int p_row,
- int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y, int ref_frm,
+void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int comp_avg,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
__m128i tmp[15];
int i, j, k;
+ const int bd = 8;
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
@@ -42,23 +40,21 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
for (j = 0; j < p_width; j += 8) {
// (x, y) coordinates of the center of this block in the destination
// image
- int32_t dst_x = p_col + j + 4;
- int32_t dst_y = p_row + i + 4;
+ const int32_t dst_x = p_col + j + 4;
+ const int32_t dst_y = p_row + i + 4;
int32_t x4, y4, ix4, sx4, iy4, sy4;
if (subsampling_x)
- x4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
+ x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+ 4;
else
x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
if (subsampling_y)
- y4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
+ y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+ 4;
else
y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
@@ -67,76 +63,104 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
iy4 = y4 >> WARPEDMODEL_PREC_BITS;
sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
// Horizontal filter
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
-
- // If the block is aligned such that, after clamping, every sample
- // would be taken from the leftmost/rightmost column, then we can
- // skip the expensive horizontal filter.
- if (ix4 <= -7) {
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+ 1)) +
ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- } else if (ix4 >= width + 6) {
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+ 1)) +
ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- } else {
- int sx = sx4 + alpha * (-4) + beta * k +
- // Include rounding and offset here
- (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ }
+ } else {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
// Load source pixels
- __m128i zero = _mm_setzero_si128();
- __m128i src =
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i src =
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
// Filter even-index pixels
- __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
// coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
- __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
// coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
- __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
// coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
- __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
// coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
- __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
// coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
- __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
// coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
- __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
// coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
- __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
- __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
- __m128i round_const =
- _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+ ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
// Calculate filtered results
- __m128i src_0 = _mm_unpacklo_epi8(src, zero);
- __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
- __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
- __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
- __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+ const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
_mm_add_epi32(res_2, res_6));
@@ -144,33 +168,37 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
- __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
- __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
- __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
- __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
- __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
- __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
__m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
_mm_add_epi32(res_3, res_7));
@@ -186,109 +214,116 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + gamma * (-4) + delta * k +
- (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ int sy = sy4 + delta * (k + 4);
// Load from tmp and rearrange pairs of consecutive rows into the
// column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- __m128i *src = tmp + (k + 4);
- __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
// Filter even-index pixels
- __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
// Filter odd-index pixels
- __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
// Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
// Round and pack into 8 bits
- __m128i round_const =
- _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+ ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
- __m128i res_lo_round = _mm_srai_epi32(
+ const __m128i res_lo_round = _mm_srai_epi32(
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
- __m128i res_hi_round = _mm_srai_epi32(
+ const __m128i res_hi_round = _mm_srai_epi32(
_mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
- __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
__m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
// Store, blending with 'pred' if needed
- __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
// Note: If we're outputting a 4x4 block, we need to be very careful
// to only output 4 pixels at this point, to avoid encode/decode
// mismatches when encoding with multiple threads.
if (p_width == 4) {
- if (ref_frm) {
+ if (comp_avg) {
const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
res_8bit = _mm_avg_epu8(res_8bit, orig);
}
*(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
} else {
- if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
+ if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
_mm_storel_epi64(p, res_8bit);
}
}
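
The renamed comp_avg flag controls a rounded average with the existing prediction: _mm_avg_epu8 computes (a + b + 1) >> 1 per byte, and for 4-wide blocks only four pixels may be written to avoid disturbing neighbouring blocks. A hedged scalar sketch of that final store step (the helper name is illustrative, not a library function):

#include <stdint.h>

// Blend the warped result into 'pred' when compound averaging is enabled,
// writing exactly p_width (4 or 8) pixels, as the SIMD store above does.
static void store_warp_row(uint8_t *pred, const uint8_t *res, int p_width,
                           int comp_avg) {
  for (int x = 0; x < p_width; ++x) {
    const int v = comp_avg ? (pred[x] + res[x] + 1) >> 1  // _mm_avg_epu8
                           : res[x];
    pred[x] = (uint8_t)v;
  }
}
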
diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c
new file mode 100644
index 000000000..494410e99
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_ssse3.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the
+ coefficients into the correct order more quickly.
+*/
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+ filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+
+#else
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
+ // dummy (replicate row index 95)
+ { 0, 0, 4, -3, 0, -1, 127, 1},
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
+// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
+// in an SSE register into two sequences:
+// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
+// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
+static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 0 };
+static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
+ 9, 11, 11, 13, 13, 15, 15, 0 };
+
+void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int comp_avg,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+ const int bd = 8;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // (x, y) coordinates of the center of this block in the destination
+ // image
+ const int32_t dst_x = p_col + j + 4;
+ const int32_t dst_y = p_row + i + 4;
+
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+ 4;
+ else
+ x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];
+
+ if (subsampling_y)
+ y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+ 4;
+ else
+ y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
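+
+      // Note: sx4/sy4 are now the (biased) filter offsets for the block's
+      // top-left output sample. The alpha/beta (resp. gamma/delta) * (-4)
+      // terms move the reference point from the block centre to that corner,
+      // (1 << (WARPEDDIFF_PREC_BITS - 1)) pre-rounds the
+      // >> WARPEDDIFF_PREC_BITS used when indexing the filter tables, and
+      // (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS) biases the index
+      // so that it stays non-negative (the tables also cover negative
+      // fractional positions). Clearing the low WARP_PARAM_REDUCE_BITS bits
+      // quantises the offsets so fewer distinct filter rows can be selected.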
+
+ // Horizontal filter
+      // If the block lies far enough off the left/right edge of the frame
+      // that, after clamping, every sample would be taken from the
+      // leftmost/rightmost column, then we can skip the expensive
+      // horizontal filter.
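+      // In that case every filter tap reads the same clamped border pixel;
+      // since the taps of each filter row sum to 1 << WARPEDPIXEL_FILTER_BITS,
+      // the horizontal result collapses to the constant written below: the
+      // usual offset term plus the border pixel scaled by
+      // (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)).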
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+ 1)) +
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+ 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ }
+ } else {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src_even =
+ _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+ const __m128i src_odd =
+ _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_2 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_3 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_4 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_5 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_6 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_7 = _mm_loadl_epi64((
+ __m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+ const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
+ const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
+ const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
+ const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+ const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+ const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
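+
+          // The unpack sequence above is in effect a transpose: each coeff_*
+          // register now holds one pair of taps for all eight output pixels,
+          // in the lane order 0 2 4 6 1 3 5 7, matching the source-pixel
+          // layout built from src_even/src_odd below.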
+
+ // The pixel order we need for 'src' is:
+ // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+ const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+ const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
+ // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+ const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+ _mm_srli_si128(src_odd, 4));
+ const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
+ // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+ const __m128i src_13 =
+ _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+ const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
+ // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+ const __m128i src_57 = _mm_unpacklo_epi64(
+ _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6));
+ const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+
+ const __m128i round_const =
+ _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+ ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
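+          // The first term of round_const is an offset that keeps the
+          // filtered value non-negative, so it can be handled as a uint16;
+          // the second term rounds the shift by HORSHEAR_REDUCE_PREC_BITS.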
+
+ // Note: The values res_02 + res_46 and res_13 + res_57 both
+ // fit into int16s at this point, but their sum may be too wide to fit
+ // into an int16. However, once we also add round_const, the sum of
+ // all of these fits into a uint16.
+ //
+ // The wrapping behaviour of _mm_add_* is used here to make sure we
+ // get the correct result despite converting between different
+ // (implicit) types.
+ const __m128i res_even = _mm_add_epi16(res_02, res_46);
+ const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+ const __m128i res =
+ _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+ tmp[k + 7] = _mm_srli_epi16(res, HORSHEAR_REDUCE_PREC_BITS);
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
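+        // The vertical pass filters the 16-bit horizontal results with the
+        // 16-bit warped_filter coefficients; each _mm_madd_epi16 below
+        // multiplies a pair of vertically adjacent rows by the matching pair
+        // of taps and accumulates into 32-bit lanes.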
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ // Round and pack into 8 bits
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+ ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
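+
+        // The negative term cancels (in effect) the offset that the
+        // horizontal pass folded into every tmp value, once it has been
+        // scaled through the vertical filter taps, re-centring the result on
+        // the true pixel value; the second term rounds the shift below.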
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+        // Note: If we're outputting a 4x4 block, we must only write 4 pixels
+        // here; an 8-byte store would spill into the adjacent block, which
+        // another thread may be working on, and cause encode/decode
+        // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ if (comp_avg) {
+ const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
+ res_8bit = _mm_avg_epu8(res_8bit, orig);
+ }
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+ }
+ }
+}