summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c')
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c75
1 files changed, 67 insertions, 8 deletions
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index abb95f31e..8ec0256eb 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -14,6 +14,7 @@
#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
@@ -52,9 +53,22 @@ static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
}
}
+static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 64;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ (void)stage_range;
+ for (int col = 0; col < col_num; col++) {
+ av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num,
+ col_num);
+ }
+}
+
static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+ case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break;
default: assert(0);
}
return NULL;
@@ -95,6 +109,42 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
transpose_32(txfm_size, buf_128, out_128);
}
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ int col_num = txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+ /*col wise transform*/
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+ /*row wise transform*/
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
+ col_num, (col_num >> 1));
+ }
+
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32x32(buf_128, out_128);
+}
+
void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
@@ -104,6 +154,15 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
}
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ (void)bd;
+ fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
const __m128i *inputB, __m128i *output) {
__m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
@@ -162,8 +221,8 @@ static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
bufA[j] = _mm_cvtepi16_epi32(buf[j]);
bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
}
- av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
- av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
@@ -209,10 +268,10 @@ static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
bufA[j] = _mm_cvtepi16_epi32(buf[j]);
bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
}
- av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
- av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
- av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
- av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
int32_t *output8 = output + 8 * 32 * i;
for (int j = 0; j < width_div8; ++j) {
@@ -260,8 +319,8 @@ static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
}
av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
- av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
- av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
int32_t *output8 = output + 8 * 32 * i;
for (int j = 0; j < (32 / 4); ++j) {