From 68569dee1416593955c1570d638b3d9250b33012 Mon Sep 17 00:00:00 2001 From: trav90 Date: Mon, 15 Oct 2018 21:45:30 -0500 Subject: Import aom library This is the reference implementation for the Alliance for Open Media's av1 video code. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36. --- third_party/aom/av1/encoder/mips/msa/error_msa.c | 108 +++++ .../aom/av1/encoder/mips/msa/fdct16x16_msa.c | 436 +++++++++++++++++++++ third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c | 98 +++++ third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c | 65 +++ third_party/aom/av1/encoder/mips/msa/fdct_msa.h | 117 ++++++ .../aom/av1/encoder/mips/msa/temporal_filter_msa.c | 284 ++++++++++++++ 6 files changed, 1108 insertions(+) create mode 100644 third_party/aom/av1/encoder/mips/msa/error_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct_msa.h create mode 100644 third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c (limited to 'third_party/aom/av1/encoder/mips/msa') diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c new file mode 100644 index 000000000..8d13af7ad --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ + static int64_t block_error_##BSize##size_msa( \ + const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \ + int64_t err = 0; \ + uint32_t loop_cnt; \ + v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \ + v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \ + v2i64 sq_coeff_r, sq_coeff_l; \ + v2i64 err0, err_dup0, err1, err_dup1; \ + \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \ + sq_coeff_l); \ + DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + \ + for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + } \ + \ + err_dup0 = __msa_splati_d(sq_coeff_r, 1); \ + err_dup1 = __msa_splati_d(sq_coeff_l, 1); \ + sq_coeff_r += err_dup0; \ + sq_coeff_l += err_dup1; \ + *ssz = __msa_copy_s_d(sq_coeff_r, 0); \ + *ssz += __msa_copy_s_d(sq_coeff_l, 0); \ + \ + err_dup0 = __msa_splati_d(err0, 1); \ + err_dup1 = __msa_splati_d(err1, 1); \ + err0 += err_dup0; \ + err1 += err_dup1; \ + err = __msa_copy_s_d(err0, 0); \ + err += __msa_copy_s_d(err1, 0); \ + \ + return err; \ + } + +/* clang-format off */ +BLOCK_ERROR_BLOCKSIZE_MSA(16) +BLOCK_ERROR_BLOCKSIZE_MSA(64) +BLOCK_ERROR_BLOCKSIZE_MSA(256) +BLOCK_ERROR_BLOCKSIZE_MSA(1024) +/* clang-format on */ + +int64_t av1_block_error_msa(const tran_low_t *coeff_ptr, + const tran_low_t *dq_coeff_ptr, intptr_t blk_size, + int64_t *ssz) { + int64_t err; + const int16_t *coeff = (const int16_t *)coeff_ptr; + const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr; + + switch (blk_size) { + case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break; + case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break; + case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break; + case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break; + default: + err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz); + break; + } + + return err; +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c new file mode 100644 index 000000000..4b0364d6c --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/enums.h" +#include "av1/encoder/mips/msa/fdct_msa.h" +#include "aom_dsp/mips/fwd_txfm_msa.h" + +static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, + const int32_t *const0, int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r15 = LD_SH(input + 15 * stride); + r7 = LD_SH(input + 7 * stride); + r8 = LD_SH(input + 8 * stride); + SLLI_4V(r0, r15, r7, r8, 2); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 8, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * stride); + r4 = LD_SH(input + 4 * stride); + r11 = LD_SH(input + 11 * stride); + r12 = LD_SH(input + 12 * stride); + SLLI_4V(r3, r4, r11, r12, 2); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp2, int_buf, 8); + ST_SH2(tp1, tp3, int_buf + 4 * 8, 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + ST_SH2(h0, h1, int_buf + 8 * 8, 8); + ST_SH2(h3, h2, int_buf + 12 * 8, 8); + + r9 = LD_SH(input + 9 * stride); + r6 = LD_SH(input + 6 * stride); + r1 = LD_SH(input + stride); + r14 = LD_SH(input + 14 * stride); + SLLI_4V(r9, r6, r1, r14, 2); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r13 = LD_SH(input + 13 * stride); + r2 = LD_SH(input + 2 * stride); + r5 = LD_SH(input + 5 * stride); + r10 = LD_SH(input + 10 * stride); + SLLI_4V(r13, r2, r5, r10, 2); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_step2_msa_helper(int16_t *int_buf, const int32_t *const0, + int16_t *out, int16_t *out_ptr) { + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); + LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + fadst16_step2_msa_helper(int_buf, const0, out, out + 128); +} + +static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); + out += 64; + + /* load input data */ + input += 128; + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); +} + +static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, + int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r7 = LD_SH(input + 7 * 8); + r8 = LD_SH(input + 8 * 8); + r15 = LD_SH(input + 15 * 8); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 4 * 2, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * 8); + r4 = LD_SH(input + 4 * 8); + r11 = LD_SH(input + 11 * 8); + r12 = LD_SH(input + 12 * 8); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp1, int_buf, 4 * 8); + ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); + ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); + + r1 = LD_SH(input + 8); + r6 = LD_SH(input + 6 * 8); + r9 = LD_SH(input + 9 * 8); + r14 = LD_SH(input + 14 * 8); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r2 = LD_SH(input + 2 * 8); + r5 = LD_SH(input + 5 * 8); + r10 = LD_SH(input + 10 * 8); + r13 = LD_SH(input + 13 * 8); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + fadst16_step2_msa_helper(int_buf, const0, out, out + 8); +} + +static void fadst16_transpose_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); + out += 16 * 8; + + /* load input data */ + input += 128; + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); +} + +static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { + int16_t *temp = intermediate; + int16_t *out = output; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; + v8i16 in12, in13, in14, in15; + + LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); + temp = intermediate + 8; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(in0, in1); + FDCT_POSTPROC_2V_NEG_H(in2, in3); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + FDCT_POSTPROC_2V_NEG_H(in6, in7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + temp = intermediate; + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + temp = intermediate; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + out = output + 8; + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); +} + +void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + DECLARE_ALIGNED(32, int16_t, tmp[256]); + DECLARE_ALIGNED(32, int16_t, trans_buf[256]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[128]); + int32_t i; + int16_t *ptmpbuf = &tmp_buf[0]; + int16_t *trans = &trans_buf[0]; + const int32_t const_arr[29 * 4] = { + 52707308, 52707308, 52707308, 52707308, -1072430300, + -1072430300, -1072430300, -1072430300, 795618043, 795618043, + 795618043, 795618043, -721080468, -721080468, -721080468, + -721080468, 459094491, 459094491, 459094491, 459094491, + -970646691, -970646691, -970646691, -970646691, 1010963856, + 1010963856, 1010963856, 1010963856, -361743294, -361743294, + -361743294, -361743294, 209469125, 209469125, 209469125, + 209469125, -1053094788, -1053094788, -1053094788, -1053094788, + 1053160324, 1053160324, 1053160324, 1053160324, 639644520, + 639644520, 639644520, 639644520, -862444000, -862444000, + -862444000, -862444000, 1062144356, 1062144356, 1062144356, + 1062144356, -157532337, -157532337, -157532337, -157532337, + 260914709, 260914709, 260914709, 260914709, -1041559667, + -1041559667, -1041559667, -1041559667, 920985831, 920985831, + 920985831, 920985831, -551995675, -551995675, -551995675, + -551995675, 596522295, 596522295, 596522295, 596522295, + 892853362, 892853362, 892853362, 892853362, -892787826, + -892787826, -892787826, -892787826, 410925857, 410925857, + 410925857, 410925857, -992012162, -992012162, -992012162, + -992012162, 992077698, 992077698, 992077698, 992077698, + 759246145, 759246145, 759246145, 759246145, -759180609, + -759180609, -759180609, -759180609, -759222975, -759222975, + -759222975, -759222975, 759288511, 759288511, 759288511, + 759288511 + }; + + switch (tx_type) { + case DCT_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case ADST_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case DCT_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + case ADST_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + default: assert(0); break; + } +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c new file mode 100644 index 000000000..da1ac74f0 --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/enums.h" +#include "av1/encoder/mips/msa/fdct_msa.h" + +void av1_fwht4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + in0 += in1; + in3 -= in2; + in4 = (in0 - in3) >> 1; + SUB2(in4, in1, in4, in2, in1, in2); + in0 -= in2; + in3 += in1; + + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + + in0 += in2; + in1 -= in3; + in4 = (in0 - in1) >> 1; + SUB2(in4, in2, in4, in3, in2, in3); + in0 -= in3; + in1 += in2; + + SLLI_4V(in0, in1, in2, in3, 2); + + TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2); + + ST4x2_UB(in0, output, 4); + ST4x2_UB(in3, output + 4, 4); + ST4x2_UB(in1, output + 8, 4); + ST4x2_UB(in2, output + 12, 4); +} + +void av1_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 temp, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + temp = __msa_ceqi_h(in0, 0); + temp = (v8i16)__msa_xori_b((v16u8)temp, 255); + temp = mask & temp; + in0 += temp; + } + + switch (tx_type) { + case DCT_DCT: + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_DCT: + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case DCT_ADST: + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_ADST: + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + default: assert(0); break; + } + + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c new file mode 100644 index 000000000..4cbf60a11 --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/enums.h" +#include "av1/encoder/mips/msa/fdct_msa.h" + +void av1_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + + switch (tx_type) { + case DCT_DCT: + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case ADST_DCT: + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case DCT_ADST: + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case ADST_ADST: + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + default: assert(0); break; + } + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h new file mode 100644 index 000000000..52bcf790c --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ +#define AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ + +#include "aom_dsp/mips/fwd_txfm_msa.h" +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_ports/mem.h" + +#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } + +#define AOM_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ + v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ + \ + UNPCK_R_SH_SW(in0, in0_r_m); \ + UNPCK_R_SH_SW(in1, in1_r_m); \ + UNPCK_R_SH_SW(in2, in2_r_m); \ + UNPCK_R_SH_SW(in3, in3_r_m); \ + \ + constant_m = __msa_fill_w(sinpi_4_9); \ + MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \ + \ + constant_m = __msa_fill_w(sinpi_1_9); \ + s0_m += in0_r_m * constant_m; \ + s1_m -= in1_r_m * constant_m; \ + \ + constant_m = __msa_fill_w(sinpi_2_9); \ + s0_m += in1_r_m * constant_m; \ + s1_m += in3_r_m * constant_m; \ + \ + s2_m = in0_r_m + in1_r_m - in3_r_m; \ + \ + constant_m = __msa_fill_w(sinpi_3_9); \ + MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \ + \ + in0_r_m = s0_m + s3_m; \ + s2_m = s1_m - s3_m; \ + s3_m = s1_m - s0_m + s3_m; \ + \ + SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \ + out0, out1, out2, out3); \ + } +#endif // AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c new file mode 100644 index 000000000..4ec679642 --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, + uint8_t *frm2_ptr, int32_t filt_sth, + int32_t filt_wgt, uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + UNPCK_UB_SH(frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + } +} + +static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, int32_t filt_wgt, + uint32_t *acc, uint16_t *cnt) { + uint32_t row; + v16i8 frm1, frm2, frm3, frm4; + v16u8 frm_r, frm_l; + v16i8 zero = { 0 }; + v8u16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 8; row--;) { + LD_SB2(frm1_ptr, stride, frm1, frm3); + frm1_ptr += stride; + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else { + av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} -- cgit v1.2.3