author    | Matt A. Tobin <email@mattatobin.com> | 2020-04-07 23:30:51 -0400
committer | wolfbeast <mcwerewolf@wolfbeast.com> | 2020-04-14 13:26:42 +0200
commit    | 277f2116b6660e9bbe7f5d67524be57eceb49b8b (patch)
tree      | 4595f7cc71418f71b9a97dfaeb03a30aa60f336a /third_party/aom/aom_dsp/mips
parent    | d270404436f6e84ffa3b92af537ac721bf10d66e (diff)
Move aom source to a sub-directory under media/libaom
There is no damned reason to treat this differently than any other media lib given its license and there never was.
Diffstat (limited to 'third_party/aom/aom_dsp/mips')
34 files changed, 0 insertions, 18141 deletions
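For context on what is being removed: the files in this directory are hand-written MIPS MSA (SIMD) implementations of aom_dsp routines — plane noise addition, 8-tap and 2-tap (bilinear) horizontal/vertical convolution, and block copy. As a rough reference for what the 8-tap horizontal kernels compute, a minimal scalar sketch follows (in the spirit of aom_convolve8_horiz_c with x_step_q4 == 16); the function name and loop structure are illustrative assumptions, not code from this tree.

    /*
     * Illustrative scalar sketch (not taken from this diff) of the operation
     * the deleted 8-tap horizontal MSA kernels vectorize, assuming libaom's
     * FILTER_BITS == 7 rounding and integer-pel stepping (x_step_q4 == 16).
     */
    #include <stdint.h>

    #define FILTER_BITS 7
    #define SUBPEL_TAPS 8

    static uint8_t clip_pixel(int val) {
      return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
    }

    static void convolve8_horiz_scalar(const uint8_t *src, int src_stride,
                                       uint8_t *dst, int dst_stride,
                                       const int16_t *filter, int w, int h) {
      src -= SUBPEL_TAPS / 2 - 1; /* center the 8-tap window ("src -= 3" in the
                                     MSA code below) */
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int sum = 0;
          for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src[x + k] * filter[k];
          /* round-shift by FILTER_BITS and clamp to 8 bits, mirroring the
             SRARI_H4_SH / SAT_SH4_SH / PCKEV_XORI128_UB sequence in the
             vector code */
          dst[x] = clip_pixel((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }

The deleted MSA versions compute the same result 8 or 16 pixels at a time with VSHF/DOTP/DPADD vector operations, followed by the same FILTER_BITS round-and-saturate step.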
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c deleted file mode 100644 index 96d04cff0..000000000 --- a/third_party/aom/aom_dsp/mips/add_noise_msa.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <stdlib.h> - -#include "aom_dsp/mips/macros_msa.h" - -void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, - char blackclamp[16], char whiteclamp[16], - char bothclamp[16], uint32_t width, - uint32_t height, int32_t pitch) { - uint32_t i, j; - - for (i = 0; i < height / 2; ++i) { - uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; - int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff)); - uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; - int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff)); - for (j = width / 16; j--;) { - v16i8 temp00_s, temp01_s; - v16u8 temp00, temp01, black_clamp, white_clamp; - v16u8 pos0, ref0, pos1, ref1; - v16i8 const127 = __msa_ldi_b(127); - - pos0 = LD_UB(pos0_ptr); - ref0 = LD_UB(ref0_ptr); - pos1 = LD_UB(pos1_ptr); - ref1 = LD_UB(ref1_ptr); - black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); - white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); - temp00 = (pos0 < black_clamp); - pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); - temp01 = (pos1 < black_clamp); - pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); - XORI_B2_128_UB(pos0, pos1); - temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp00 = (v16u8)(temp00_s < pos0); - pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); - temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp01 = (temp01_s < pos1); - pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); - XORI_B2_128_UB(pos0, pos1); - pos0 += ref0; - ST_UB(pos0, pos0_ptr); - pos1 += ref1; - ST_UB(pos1, pos1_ptr); - pos0_ptr += 16; - pos1_ptr += 16; - ref0_ptr += 16; - ref1_ptr += 16; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c deleted file mode 100644 index 363fad308..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 mask0, mask1, mask2, mask3, out; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v8i16 filt, out0, out1; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out0, out1); - SRARI_H2_SH(out0, out1, FILTER_BITS); - SAT_SH2_SH(out0, out1, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 filt0, filt1, filt2, filt3; - v16i8 src0, src1, src2, src3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out0, out1); - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - out = PCKEV_XORI128_UB(out2, out3); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out0, out1, out2, - out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - tmp0 = PCKEV_XORI128_UB(out0, out1); - tmp1 = PCKEV_XORI128_UB(out2, out3); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); -} 
- -static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - tmp0 = PCKEV_XORI128_UB(out0, out1); - tmp1 = PCKEV_XORI128_UB(out2, out3); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); - } -} - -static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - LD_SB2(src, src_stride, src0, src2); - LD_SB2(src + 8, src_stride, src1, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (2 * src_stride); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - dst += dst_stride; - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst); - dst += dst_stride; - } -} - -static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, 
out2, out3, 7); - - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 16); - dst += dst_stride; - - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 16); - dst += dst_stride; - } -} - -static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - int32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 16); - - src0 = LD_SB(src + 32); - src2 = LD_SB(src + 48); - src3 = LD_SB(src + 56); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst + 32); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 48); - dst += dst_stride; - } -} - -static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, vec0, vec1, res0, res1; - v8u16 vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); - SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 vec0, vec1, vec2, vec3, filt0; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16i8 res0, res1, res2, res3; - v8u16 vec4, vec5, vec6, vec7, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, 
src3, src4, src5, src6, src7); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, - vec6, vec7); - SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 filt0; - v16i8 src0, src1, src2, src3, mask; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); - ST8x4_UB(src0, src1, dst, dst_stride); -} - -static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - v16u8 filt0; - v16i8 src0, src1, src2, src3, mask, out0, out1; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - if (16 == height) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - - VSHF_B2_UH(src0, src0, 
src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); - } -} - -static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); - } -} - -static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - loop_cnt = (height >> 2) - 1; - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - dst += dst_stride; - PCKEV_ST_SB(out2, out3, dst); - dst += dst_stride; - PCKEV_ST_SB(out4, out5, dst); - dst += dst_stride; - PCKEV_ST_SB(out6, out7, dst); - dst += dst_stride; - - for (; loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - dst += dst_stride; - PCKEV_ST_SB(out2, out3, dst); - dst += dst_stride; - PCKEV_ST_SB(out4, out5, dst); - dst += dst_stride; - PCKEV_ST_SB(out6, out7, dst); - dst += dst_stride; - } -} - -static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = height >> 
1; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - src4 = LD_SB(src); - src6 = LD_SB(src + 16); - src7 = LD_SB(src + 24); - src5 = __msa_sldi_b(src6, src4, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - PCKEV_ST_SB(out2, out3, dst + 16); - dst += dst_stride; - PCKEV_ST_SB(out4, out5, dst); - PCKEV_ST_SB(out6, out7, dst + 16); - dst += dst_stride; - } -} - -static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = height; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src4 = LD_SB(src + 32); - src6 = LD_SB(src + 48); - src7 = LD_SB(src + 56); - SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - PCKEV_ST_SB(out2, out3, dst + 16); - PCKEV_ST_SB(out4, out5, dst + 32); - PCKEV_ST_SB(out6, out7, dst + 48); - dst += dst_stride; - } -} - -void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_hor[8]; - - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0) { - switch (w) { - case 4: - common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 8: - common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 16: - common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 32: - common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 64: - common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - default: - aom_convolve8_horiz_c(src, 
src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 8: - common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 16: - common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 32: - common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 64: - common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - default: - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c deleted file mode 100644 index aa962b41f..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c +++ /dev/null @@ -1,701 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; - v16i8 src10998, filt0, filt1, filt2, filt3; - v16u8 out; - v8i16 filt, out10, out32; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, - src4332, src6554); - XORI_B3_128_SB(src2110, src4332, src6554); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); - XORI_B2_128_SB(src8776, src10998); - out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, - filt1, filt2, filt3); - out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, - filt1, filt2, filt3); - SRARI_H2_SH(out10, out32, FILTER_BITS); - SAT_SH2_SH(out10, out32, 7); - out = PCKEV_XORI128_UB(out10, out32); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - - src2110 = src6554; - src4332 = src8776; - src6554 = src10998; - src6 = src10; - } -} - -static void 
common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; - v16u8 tmp0, tmp1; - v8i16 filt, out0_r, out1_r, out2_r, out3_r; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); - tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src6 = src10; - } -} - -static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt0, filt1, filt2, filt3; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; - v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; - v16u8 tmp0, tmp1, tmp2, tmp3; - v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, - src54_l, src21_l); - ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, - src87_l, src98_l, src109_l); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, 
src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, - filt1, filt2, filt3); - out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, - filt1, filt2, filt3); - out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, - filt1, filt2, filt3); - out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); - PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, - tmp0, tmp1, tmp2, tmp3); - XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); - ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); - dst += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src10_l = src54_l; - src32_l = src76_l; - src54_l = src98_l; - src21_l = src65_l; - src43_l = src87_l; - src65_l = src109_l; - src6 = src10; - } -} - -static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height, - int32_t width) { - const uint8_t *src_tmp; - uint8_t *dst_tmp; - uint32_t loop_cnt, cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt0, filt1, filt2, filt3; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; - v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; - v16u8 tmp0, tmp1, tmp2, tmp3; - v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - for (cnt = (width >> 4); cnt--;) { - src_tmp = src; - dst_tmp = dst; - - LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src_tmp += (7 * src_stride); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, - src54_l, src21_l); - ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src_tmp += (4 * src_stride); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, - src87_l, src98_l, src109_l); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - 
filt1, filt2, filt3); - out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, - filt1, filt2, filt3); - out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, - filt1, filt2, filt3); - out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, - filt1, filt2, filt3); - out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); - PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, - out3_r, tmp0, tmp1, tmp2, tmp3); - XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); - ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); - dst_tmp += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src10_l = src54_l; - src32_l = src76_l; - src54_l = src98_l; - src21_l = src65_l; - src43_l = src87_l; - src65_l = src109_l; - src6 = src10; - } - - src += 16; - dst += 16; - } -} - -static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - 32); -} - -static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - 64); -} - -static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4; - v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; - v16u8 filt0; - v8i16 filt; - v8u16 tmp0, tmp1; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - src += (5 * src_stride); - - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 filt; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - src8 = LD_SB(src); - src += src_stride; - - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, - src76_r, src87_r); - ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, - src76_r, src2110, src4332, src6554, src8776); - DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - 
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); -} - -static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; - v16i8 out0, out1; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); - ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); -} - -static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v16i8 out0, out1; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); - src += (8 * src_stride); - - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, - vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - src0 = src8; - } -} - -static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); - } -} - -static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, 
src3, src4); - src += (4 * src_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - dst += dst_stride; - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst); - dst += dst_stride; - - src0 = src4; - } -} - -static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src5 = LD_UB(src + 16); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - - LD_UB4(src + 16, src_stride, src6, src7, src8, src9); - src += (4 * src_stride); - - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); - - ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); - ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 16); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); - - ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); - ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); - dst += (4 * dst_stride); - - src0 = src4; - src5 = src9; - } -} - -static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 filt; - - /* rearranging 
filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_UB4(src, 16, src0, src3, src6, src9); - src += src_stride; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - LD_UB2(src, src_stride, src1, src2); - LD_UB2(src + 16, src_stride, src4, src5); - LD_UB2(src + 32, src_stride, src7, src8); - LD_UB2(src + 48, src_stride, src10, src11); - src += (2 * src_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); - - ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); - ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_ST_SB(tmp4, tmp5, dst + 16); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); - - ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); - ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 32); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); - - ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); - ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_ST_SB(tmp4, tmp5, dst + 48); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); - dst += (2 * dst_stride); - - src0 = src2; - src3 = src5; - src6 = src8; - src9 = src11; - } -} - -void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_ver[8]; - - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 8; cnt--;) { - filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 8: - common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 16: - common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 32: - common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 64: - common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 8: - common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 16: - common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 32: - common_vt_8t_32w_msa(src, 
(int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 64: - common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c deleted file mode 100644 index f7f116f4d..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <string.h> -#include "aom_dsp/mips/macros_msa.h" - -static void copy_width8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - out4 = __msa_copy_u_d((v2i64)src4, 0); - out5 = __msa_copy_u_d((v2i64)src5, 0); - out6 = __msa_copy_u_d((v2i64)src6, 0); - out7 = __msa_copy_u_d((v2i64)src7, 0); - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - for (cnt = height >> 3; cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - out4 = __msa_copy_u_d((v2i64)src4, 0); - out5 = __msa_copy_u_d((v2i64)src5, 0); - out6 = __msa_copy_u_d((v2i64)src6, 0); - out7 = __msa_copy_u_d((v2i64)src7, 0); - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 4) { - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 2) { - for (cnt = (height / 2); cnt--;) { - LD_UB2(src, 
src_stride, src0, src1); - src += (2 * src_stride); - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - - SD(out0, dst); - dst += dst_stride; - SD(out1, dst); - dst += dst_stride; - } - } -} - -static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int32_t height, int32_t width) { - int32_t cnt, loop_cnt; - const uint8_t *src_tmp; - uint8_t *dst_tmp; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - for (cnt = (width >> 4); cnt--;) { - src_tmp = src; - dst_tmp = dst; - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6, - src7); - src_tmp += (8 * src_stride); - - ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp, - dst_stride); - dst_tmp += (8 * dst_stride); - } - - src += 16; - dst += 16; - } -} - -static void copy_width16_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); - dst += (8 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); - } else if (0 == height % 4) { - for (cnt = (height >> 2); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); - } - } -} - -static void copy_width32_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); - } else if (0 == height % 4) { - for (cnt = (height >> 2); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - } - } -} - -static void copy_width64_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - copy_16multx8mult_msa(src, src_stride, 
dst, dst_stride, height, 64); -} - -void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - - switch (w) { - case 4: { - uint32_t cnt, tmp; - /* 1 word storage */ - for (cnt = h; cnt--;) { - tmp = LW(src); - SW(tmp, dst); - src += src_stride; - dst += dst_stride; - } - break; - } - case 8: { - copy_width8_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 16: { - copy_width16_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 32: { - copy_width32_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 64: { - copy_width64_msa(src, src_stride, dst, dst_stride, h); - break; - } - default: { - uint32_t cnt; - for (cnt = h; cnt--;) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h deleted file mode 100644 index 852415c20..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ -#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" -#include "aom_dsp/aom_filter.h" - -extern const uint8_t mc_filt_mask_arr[16 * 3]; - -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ - filt3) \ - ({ \ - v8i16 tmp_dpadd_0, tmp_dpadd_1; \ - \ - tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ - tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ - tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ - \ - tmp_dpadd_0; \ - }) - -#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, mask3, filt0, filt1, filt2, filt3, \ - out0, out1) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ - ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ - } - -#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, mask3, filt0, filt1, filt2, filt3, \ - out0, out1, out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ - res4_m, res5_m, res6_m, res7_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ - res4_m, res5_m, res6_m, res7_m); \ - ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ - res7_m, out0, out1, out2, out3); \ - } - -#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c deleted file mode 100644 index 00ab75dc3..000000000 --- a/third_party/aom/aom_dsp/mips/common_dspr2.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; -uint8_t *aom_ff_cropTbl; - -void aom_dsputil_static_init(void) { - int i; - - for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i; - - for (i = 0; i < CROP_WIDTH; i++) { - aom_ff_cropTbl_a[i] = 0; - aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; - } - - aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH]; -} - -#endif diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h deleted file mode 100644 index c42188d62..000000000 --- a/third_party/aom/aom_dsp/mips/common_dspr2.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif -#if HAVE_DSPR2 -#define CROP_WIDTH 512 - -extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c" - -static INLINE void prefetch_load(const unsigned char *src) { - __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); -} - -/* prefetch data for store */ -static INLINE void prefetch_store(unsigned char *dst) { - __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); -} - -static INLINE void prefetch_load_streamed(const unsigned char *src) { - __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); -} - -/* prefetch data for store */ -static INLINE void prefetch_store_streamed(unsigned char *dst) { - __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); -} -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c deleted file mode 100644 index 08bf1ab30..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c +++ /dev/null @@ -1,1031 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_horiz_4_transposed_dspr2( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - int32_t Temp1, Temp2; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - dst_ptr = dst; - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp2](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p1], %[Temp1](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[p1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[tp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[p2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [src] "r"(src), [dst_stride] "r"(dst_stride)); - - /* Next row... */ - src += src_stride; - dst += 1; - } -} - -static void convolve_bi_horiz_8_transposed_dspr2( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - uint32_t vector4a = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3; - uint32_t p1, p2, p3, p4; - uint8_t *odd_dst; - uint32_t dst_pitch_2 = (dst_stride << 1); - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - dst_ptr = dst; - odd_dst = (dst_ptr + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "balign %[tp3], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" - "lbux %[tp1], %[Temp3](%[cm]) \n\t" - "extp %[p3], $ac1, 31 \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[Temp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[Temp1], %[p3](%[cm]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "sb %[Temp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp1], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. pixel */ - "lbux %[tp3], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. pixel */ - "sb %[tp3], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[p1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[p2], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[p1], 0(%[odd_dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), - [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), - [odd_dst] "+r"(odd_dst) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); - - /* Next row... 
*/ - src += src_stride; - dst += 1; - } -} - -static void convolve_bi_horiz_16_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "ulw %[qload1], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload1] " - "\n\t" - "ulw %[qload2], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. 
pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += 1; - } -} - -static void convolve_bi_horiz_64_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "ulw %[qload1], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. 
pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload1] " - "\n\t" - "ulw %[qload2], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. 
pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += 1; - } -} - -void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, int w, int h) { - int x, y; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - int sum = 0; - - sum += src[x] * filter[3]; - sum += src[x + 1] * filter[4]; - - dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - } - - src += src_stride; - dst += 1; - } -} - -void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter, int w, - int h) { - uint32_t pos = 38; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - - switch (w) { - case 4: - convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h); - break; - case 8: - convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h); - break; - case 16: - case 32: - convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h, (w / 16)); - break; - case 64: - prefetch_load(src + 32); - convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h); - break; - default: - convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, - h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c deleted file mode 100644 index 097da73ca..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c +++ /dev/null @@ -1,681 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p1], %[Temp2](%[cm]) \n\t" - "lbux %[p2], %[Temp4](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst]) \n\t" - "sb %[p1], 1(%[dst]) \n\t" - "sb %[tp2], 2(%[dst]) \n\t" - "sb %[p2], 3(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=&r"(Temp4) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3; - uint32_t p1, p2, p3, p4; - uint32_t st0, st1; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. 
pixel */ - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[st0], 0(%[dst]) \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - - "balign %[tp3], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[st1], 2(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "sb %[st0], 4(%[dst]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. pixel */ - "sb %[st1], 1(%[dst]) \n\t" - "sb %[st0], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[p1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 3(%[dst]) \n\t" - "sb %[p2], 5(%[dst]) \n\t" - "sb %[p1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... 
*/ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - uint32_t pos = 38; - - assert(x_step_q4 == 16); - - prefetch_load((const uint8_t *)filter_x); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 8: - convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 16: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 1); - break; - case 32: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 2); - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - default: - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c deleted file mode 100644 index 40abfd89e..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - uint32_t pos = 38; - - assert(y_step_q4 == 16); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, - h); - break; - case 64: - prefetch_store(dst + 32); - convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c deleted file mode 100644 index af54b4264..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { - int x, y; - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: { - uint32_t tp1; - - /* 1 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], (%[src]) \n\t" - "sw %[tp1], (%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 8: { - uint32_t tp1, tp2; - - /* 2 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 16: { - uint32_t tp1, tp2, tp3, tp4; - - /* 4 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "ulw 
%[tp3], 8(%[src]) \n\t" - "ulw %[tp4], 12(%[src]) \n\t" - - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 32: { - uint32_t tp1, tp2, tp3, tp4; - uint32_t tp5, tp6, tp7, tp8; - - /* 8 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "ulw %[tp4], 12(%[src]) \n\t" - "ulw %[tp5], 16(%[src]) \n\t" - "ulw %[tp6], 20(%[src]) \n\t" - "ulw %[tp7], 24(%[src]) \n\t" - "ulw %[tp8], 28(%[src]) \n\t" - - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), - [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 64: { - uint32_t tp1, tp2, tp3, tp4; - uint32_t tp5, tp6, tp7, tp8; - - prefetch_load(src + 64); - prefetch_store(dst + 32); - - /* 16 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_load(src + src_stride + 64); - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "ulw %[tp4], 12(%[src]) \n\t" - "ulw %[tp5], 16(%[src]) \n\t" - "ulw %[tp6], 20(%[src]) \n\t" - "ulw %[tp7], 24(%[src]) \n\t" - "ulw %[tp8], 28(%[src]) \n\t" - - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ - - "ulw %[tp1], 32(%[src]) \n\t" - "ulw %[tp2], 36(%[src]) \n\t" - "ulw %[tp3], 40(%[src]) \n\t" - "ulw %[tp4], 44(%[src]) \n\t" - "ulw %[tp5], 48(%[src]) \n\t" - "ulw %[tp6], 52(%[src]) \n\t" - "ulw %[tp7], 56(%[src]) \n\t" - "ulw %[tp8], 60(%[src]) \n\t" - - "sw %[tp1], 32(%[dst]) \n\t" /* store */ - "sw %[tp2], 36(%[dst]) \n\t" /* store */ - "sw %[tp3], 40(%[dst]) \n\t" /* store */ - "sw %[tp4], 44(%[dst]) \n\t" /* store */ - "sw %[tp5], 48(%[dst]) \n\t" /* store */ - "sw %[tp6], 52(%[dst]) \n\t" /* store */ - "sw %[tp7], 56(%[dst]) \n\t" /* store */ - "sw %[tp8], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), - [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - default: - for (y = h; y--;) { - for (x = 0; x < w; ++x) { - dst[x] = src[x]; - } - - src += src_stride; - dst += dst_stride; - } - break; - } -} -#endif diff --git 
a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c deleted file mode 100644 index f9c6879ab..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c +++ /dev/null @@ -1,879 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4; - uint32_t n1, n2, n3, n4; - uint32_t tn1, tn2; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[n1], %[tp2] \n\t" - "preceu.ph.qbl %[n2], %[tp2] \n\t" - "preceu.ph.qbr %[n3], %[tn2] \n\t" - "preceu.ph.qbl %[n4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. 
pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[n1], %[tn1] \n\t" - "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[tn1], %[Temp2](%[cm]) \n\t" - "lbux %[n2], %[Temp4](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst]) \n\t" - "sb %[tn1], 1(%[dst]) \n\t" - "sb %[tp2], 2(%[dst]) \n\t" - "sb %[n2], 3(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), - [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4, n1; - uint32_t tn1, tn2, tn3; - uint32_t st0, st1; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "preceu.ph.qbl %[n1], %[tn2] \n\t" - "ulw %[tn1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[tn1] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - /* even 4. 
pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[st0], 0(%[dst]) \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - - "balign %[tn3], %[tn1], 3 \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[st1], 2(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tn2] \n\t" - "preceu.ph.qbl %[p4], %[tn2] \n\t" - "sb %[st0], 4(%[dst]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn1] \n\t" - "preceu.ph.qbl %[n1], %[tn1] \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "preceu.ph.qbr %[p2], %[tn3] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. pixel */ - "sb %[st1], 1(%[dst]) \n\t" - "sb %[st0], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[n1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 3(%[dst]) \n\t" - "sb %[p2], 5(%[dst]) \n\t" - "sb %[n1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), - [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, - uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, - uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - if (((const int32_t *)filter_x)[0] == 0) { - aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - prefetch_load((const uint8_t *)filter_x); - src -= 3; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 8: - convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 16: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 1); - break; - case 32: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 2); - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - default: - aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c deleted file mode 100644 index 201e66427..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c +++ /dev/null @@ -1,361 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw 
%[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph $ac1, 
%[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - if (((const int32_t *)filter_y)[0] == 0) { - aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); - break; - case 64: - prefetch_store(dst + 32); - convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} - -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h deleted file mode 100644 index e5d48a884..000000000 --- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); - -void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter, int w, - int h); - -void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); - -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c deleted file mode 100644 index 7c221ae89..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; - - (void)above; - - __asm__ __volatile__( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "lb %[tmp5], 4(%[left]) \n\t" - "lb %[tmp6], 5(%[left]) \n\t" - "lb %[tmp7], 6(%[left]) \n\t" - "lb %[tmp8], 7(%[left]) \n\t" - "lb %[tmp9], 8(%[left]) \n\t" - "lb %[tmp10], 9(%[left]) \n\t" - "lb %[tmp11], 10(%[left]) \n\t" - "lb %[tmp12], 11(%[left]) \n\t" - "lb %[tmp13], 12(%[left]) \n\t" - "lb %[tmp14], 13(%[left]) \n\t" - "lb %[tmp15], 14(%[left]) \n\t" - "lb %[tmp16], 15(%[left]) \n\t" - - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "replv.qb %[tmp5], %[tmp5] \n\t" - "replv.qb %[tmp6], %[tmp6] \n\t" - "replv.qb %[tmp7], %[tmp7] \n\t" - "replv.qb %[tmp8], %[tmp8] \n\t" - "replv.qb %[tmp9], %[tmp9] \n\t" - "replv.qb %[tmp10], %[tmp10] \n\t" - "replv.qb %[tmp11], %[tmp11] \n\t" - "replv.qb %[tmp12], %[tmp12] \n\t" - "replv.qb %[tmp13], %[tmp13] \n\t" - "replv.qb %[tmp14], %[tmp14] \n\t" - "replv.qb %[tmp15], %[tmp15] \n\t" - "replv.qb %[tmp16], %[tmp16] \n\t" - - "sw %[tmp1], (%[dst]) \n\t" - "sw %[tmp1], 4(%[dst]) \n\t" - "sw %[tmp1], 8(%[dst]) \n\t" - "sw %[tmp1], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "sw %[tmp2], 4(%[dst]) \n\t" - "sw %[tmp2], 8(%[dst]) \n\t" - "sw %[tmp2], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "sw %[tmp3], 4(%[dst]) \n\t" - "sw %[tmp3], 8(%[dst]) \n\t" - "sw %[tmp3], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - "sw %[tmp4], 4(%[dst]) \n\t" - "sw %[tmp4], 8(%[dst]) \n\t" - "sw %[tmp4], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp5], (%[dst]) \n\t" - "sw %[tmp5], 4(%[dst]) \n\t" - "sw %[tmp5], 8(%[dst]) \n\t" - "sw %[tmp5], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp6], (%[dst]) \n\t" - "sw %[tmp6], 4(%[dst]) \n\t" - "sw %[tmp6], 8(%[dst]) \n\t" - "sw %[tmp6], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp7], (%[dst]) \n\t" - "sw %[tmp7], 4(%[dst]) \n\t" - "sw %[tmp7], 8(%[dst]) \n\t" - "sw %[tmp7], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp8], (%[dst]) \n\t" - "sw %[tmp8], 4(%[dst]) \n\t" - "sw %[tmp8], 8(%[dst]) \n\t" - "sw %[tmp8], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp9], (%[dst]) \n\t" - "sw %[tmp9], 4(%[dst]) \n\t" - "sw %[tmp9], 8(%[dst]) \n\t" - "sw %[tmp9], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp10], (%[dst]) \n\t" - "sw %[tmp10], 4(%[dst]) \n\t" - "sw %[tmp10], 8(%[dst]) \n\t" - "sw %[tmp10], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp11], (%[dst]) \n\t" - "sw %[tmp11], 4(%[dst]) \n\t" - "sw %[tmp11], 8(%[dst]) \n\t" - "sw %[tmp11], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp12], (%[dst]) \n\t" - "sw %[tmp12], 4(%[dst]) \n\t" - "sw %[tmp12], 8(%[dst]) \n\t" - "sw %[tmp12], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp13], (%[dst]) \n\t" - "sw %[tmp13], 4(%[dst]) \n\t" - "sw %[tmp13], 8(%[dst]) \n\t" - "sw %[tmp13], 
12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp14], (%[dst]) \n\t" - "sw %[tmp14], 4(%[dst]) \n\t" - "sw %[tmp14], 8(%[dst]) \n\t" - "sw %[tmp14], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp15], (%[dst]) \n\t" - "sw %[tmp15], 4(%[dst]) \n\t" - "sw %[tmp15], 8(%[dst]) \n\t" - "sw %[tmp15], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp16], (%[dst]) \n\t" - "sw %[tmp16], 4(%[dst]) \n\t" - "sw %[tmp16], 8(%[dst]) \n\t" - "sw %[tmp16], 12(%[dst]) \n\t" - - : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), - [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), - [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), - [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), - [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), - [tmp16] "=&r"(tmp16) - : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); -} - -void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, left2; - - __asm__ __volatile__( - "lw %[above1], (%[above]) \n\t" - "lw %[above2], 4(%[above]) \n\t" - "lw %[left1], (%[left]) \n\t" - "lw %[left2], 4(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "addu.ph %[average], %[above_r1], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "preceu.ph.qbl %[above_l1], %[above2] \n\t" - "preceu.ph.qbr %[above_r1], %[above2] \n\t" - "preceu.ph.qbl %[left_l1], %[left2] \n\t" - "preceu.ph.qbr %[left_r1], %[left2] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "lw %[above1], 8(%[above]) \n\t" - "lw %[above2], 12(%[above]) \n\t" - "lw %[left1], 8(%[left]) \n\t" - "lw %[left2], 12(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "preceu.ph.qbl %[above_l1], %[above2] \n\t" - "preceu.ph.qbr %[above_r1], %[above2] \n\t" - "preceu.ph.qbl %[left_l1], %[left2] \n\t" - "preceu.ph.qbr %[left_r1], %[left2] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "addiu %[average], %[average], 16 \n\t" - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 5 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw 
%[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), - [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), - [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), - [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), - [expected_dc] "=&r"(expected_dc) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride)); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c deleted file mode 100644 index 0a21979c7..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4; - (void)above; - - __asm__ __volatile__( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "sw %[tmp1], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - - : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), - [tmp4] "=&r"(tmp4) - : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); -} - -void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; - - __asm__ __volatile__( - "lw %[above_c], (%[above]) \n\t" - "lw %[left_c], (%[left]) \n\t" - - "preceu.ph.qbl %[above_l], %[above_c] \n\t" - "preceu.ph.qbr %[above_r], %[above_c] \n\t" - "preceu.ph.qbl %[left_l], %[left_c] \n\t" - "preceu.ph.qbr %[left_r], %[left_c] \n\t" - - "addu.ph %[average], %[above_r], %[above_l] \n\t" - "addu.ph %[average], %[average], %[left_l] \n\t" - "addu.ph %[average], %[average], %[left_r] \n\t" - "addiu %[average], %[average], 4 \n\t" - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 3 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - - : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), - [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), - [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), - [average] "=&r"(average), [tmp] "=&r"(tmp), - [expected_dc] "=&r"(expected_dc) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride)); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c deleted file mode 100644 index d42a77c80..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - (void)above; - - __asm__ __volatile__( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "lb %[tmp5], 4(%[left]) \n\t" - "lb %[tmp6], 5(%[left]) \n\t" - "lb %[tmp7], 6(%[left]) \n\t" - "lb %[tmp8], 7(%[left]) \n\t" - - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "replv.qb %[tmp5], %[tmp5] \n\t" - "replv.qb %[tmp6], %[tmp6] \n\t" - "replv.qb %[tmp7], %[tmp7] \n\t" - "replv.qb %[tmp8], %[tmp8] \n\t" - - "sw %[tmp1], (%[dst]) \n\t" - "sw %[tmp1], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "sw %[tmp2], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "sw %[tmp3], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - "sw %[tmp4], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp5], (%[dst]) \n\t" - "sw %[tmp5], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp6], (%[dst]) \n\t" - "sw %[tmp6], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp7], (%[dst]) \n\t" - "sw %[tmp7], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp8], (%[dst]) \n\t" - "sw %[tmp8], 4(%[dst]) \n\t" - - : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), - [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), - [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) - : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); -} - -void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; - - __asm__ __volatile__( - "lw %[above1], (%[above]) \n\t" - "lw %[above2], 4(%[above]) \n\t" - "lw %[left1], (%[left]) \n\t" - "lw %[left2], 4(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "preceu.ph.qbl %[above_l2], %[above2] \n\t" - "preceu.ph.qbr %[above_r2], %[above2] \n\t" - "preceu.ph.qbl %[left_l2], %[left2] \n\t" - "preceu.ph.qbr %[left_r2], %[left2] \n\t" - - "addu.ph %[average], %[above_r1], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "addu.ph %[average], %[average], %[above_l2] \n\t" - "addu.ph %[average], %[average], %[above_r2] \n\t" - "addu.ph %[average], %[average], %[left_l2] \n\t" - "addu.ph %[average], %[average], %[left_r2] \n\t" - - "addiu %[average], %[average], 8 \n\t" - - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 4 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw 
%[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), - [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), - [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), - [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), - [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), - [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), - [average] "=&r"(average), [tmp] "=&r"(tmp), - [expected_dc] "=&r"(expected_dc) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride)); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c deleted file mode 100644 index 9f25cc1ca..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred_msa.c +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ - { \ - out0 = __msa_subs_u_h(out0, in0); \ - out1 = __msa_subs_u_h(out1, in1); \ - } - -static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t src_data; - - src_data = LW(src); - - SW4(src_data, src_data, src_data, src_data, dst, dst_stride); -} - -static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint32_t src_data1, src_data2; - - src_data1 = LW(src); - src_data2 = LW(src + 4); - - for (row = 8; row--;) { - SW(src_data1, dst); - SW(src_data2, (dst + 4)); - dst += dst_stride; - } -} - -static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 src0; - - src0 = LD_UB(src); - - for (row = 16; row--;) { - ST_UB(src0, dst); - dst += dst_stride; - } -} - -static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 src1, src2; - - src1 = LD_UB(src); - src2 = LD_UB(src + 16); - - for (row = 32; row--;) { - ST_UB2(src1, src2, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t out0, out1, out2, out3; - - out0 = src[0] * 0x01010101; - out1 = src[1] * 0x01010101; - out2 = src[2] * 0x01010101; - out3 = src[3] * 0x01010101; - - SW4(out0, out1, out2, out3, dst, dst_stride); -} - -static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - out0 = src[0] * 0x0101010101010101ull; - out1 = src[1] * 0x0101010101010101ull; - out2 = src[2] * 0x0101010101010101ull; - out3 = src[3] * 0x0101010101010101ull; - out4 = src[4] * 0x0101010101010101ull; - out5 = src[5] * 0x0101010101010101ull; - out6 = src[6] * 0x0101010101010101ull; - out7 = src[7] * 0x0101010101010101ull; - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); -} - -static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint8_t inp0, inp1, inp2, inp3; - v16u8 src0, src1, src2, src3; - - for (row = 4; row--;) { - inp0 = src[0]; - inp1 = src[1]; - inp2 = src[2]; - inp3 = src[3]; - src += 4; - - src0 = (v16u8)__msa_fill_b(inp0); - src1 = (v16u8)__msa_fill_b(inp1); - src2 = (v16u8)__msa_fill_b(inp2); - src3 = (v16u8)__msa_fill_b(inp3); - - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint8_t inp0, inp1, inp2, inp3; - v16u8 src0, src1, src2, src3; - - for (row = 8; row--;) { - inp0 = src[0]; - inp1 = src[1]; - inp2 = src[2]; - inp3 = src[3]; - src += 4; - - src0 = (v16u8)__msa_fill_b(inp0); - src1 = (v16u8)__msa_fill_b(inp1); - src2 = (v16u8)__msa_fill_b(inp2); - src3 = (v16u8)__msa_fill_b(inp3); - - ST_UB2(src0, src0, dst, 16); - dst += dst_stride; - ST_UB2(src1, src1, dst, 16); - dst += dst_stride; - ST_UB2(src2, src2, dst, 16); - dst += dst_stride; - ST_UB2(src3, src3, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_dc_4x4_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint32_t val0, val1; - v16i8 store, src = { 0 }; - v8u16 sum_h; 
- v4u32 sum_w; - v2u64 sum_d; - - val0 = LW(src_top); - val1 = LW(src_left); - INSERT_W2_SB(val0, val1, src); - sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_w((v4i32)store, 0); - - SW4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t val0; - v16i8 store, data = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - - val0 = LW(src); - data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); - sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_w((v4i32)store, 0); - - SW4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { - uint32_t out; - const v16i8 store = __msa_ldi_b(128); - - out = __msa_copy_u_w((v4i32)store, 0); - - SW4(out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_8x8_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint64_t val0, val1; - v16i8 store; - v16u8 src = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LD(src_top); - val1 = LD(src_left); - INSERT_D2_UB(val0, val1, src); - sum_h = __msa_hadd_u_h(src, src); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_d((v2i64)store, 0); - - SD4(val0, val0, val0, val0, dst, dst_stride); - dst += (4 * dst_stride); - SD4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint64_t val0; - v16i8 store; - v16u8 data = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LD(src); - data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); - sum_h = __msa_hadd_u_h(data, data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_d((v2i64)store, 0); - - SD4(val0, val0, val0, val0, dst, dst_stride); - dst += (4 * dst_stride); - SD4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { - uint64_t out; - const v16i8 store = __msa_ldi_b(128); - - out = __msa_copy_u_d((v2i64)store, 0); - - SD4(out, out, out, out, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_16x16_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - v16u8 top, left, out; - v8u16 sum_h, sum_top, sum_left; - v4u32 sum_w; - v2u64 sum_d; - - top = LD_UB(src_top); - left = LD_UB(src_left); - HADD_UB2_UH(top, left, sum_top, sum_left); - sum_h = sum_top + sum_left; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - 
ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - v16u8 data, out; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - data = LD_UB(src); - sum_h = __msa_hadd_u_h(data, data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { - const v16u8 out = (v16u8)__msa_ldi_b(128); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 top0, top1, left0, left1, out; - v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; - v4u32 sum_w; - v2u64 sum_d; - - LD_UB2(src_top, 16, top0, top1); - LD_UB2(src_left, 16, left0, left1); - HADD_UB2_UH(top0, top1, sum_top0, sum_top1); - HADD_UB2_UH(left0, left1, sum_left0, sum_left1); - sum_h = sum_top0 + sum_top1; - sum_h += sum_left0 + sum_left1; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 data0, data1, out; - v8u16 sum_h, sum_data0, sum_data1; - v4u32 sum_w; - v2u64 sum_d; - - LD_UB2(src, 16, data0, data1); - HADD_UB2_UH(data0, data1, sum_data0, sum_data1); - sum_h = sum_data0 + sum_data1; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { - uint32_t row; - const v16u8 out = (v16u8)__msa_ldi_b(128); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_4x4_msa(above, dst, y_stride); -} - -void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_8x8_msa(above, dst, y_stride); -} - -void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { 
- (void)left; - - intra_predict_vert_16x16_msa(above, dst, y_stride); -} - -void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_32x32_msa(above, dst, y_stride); -} - -void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_4x4_msa(left, dst, y_stride); -} - -void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_8x8_msa(left, dst, y_stride); -} - -void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_16x16_msa(left, dst, y_stride); -} - -void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_32x32_msa(left, dst, y_stride); -} - -void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_4x4_msa(above, left, dst, y_stride); -} - -void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_8x8_msa(above, left, dst, y_stride); -} - -void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_16x16_msa(above, left, dst, y_stride); -} - -void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_32x32_msa(above, left, dst, y_stride); -} - -void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_4x4_msa(above, dst, y_stride); -} - -void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_8x8_msa(above, dst, y_stride); -} - -void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_16x16_msa(above, dst, y_stride); -} - -void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_32x32_msa(above, dst, y_stride); -} - -void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_4x4_msa(left, dst, y_stride); -} - -void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_8x8_msa(left, dst, y_stride); -} - -void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_16x16_msa(left, dst, y_stride); -} - -void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_32x32_msa(left, dst, y_stride); -} - -void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_4x4_msa(dst, y_stride); -} - -void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const 
uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_8x8_msa(dst, y_stride); -} - -void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_16x16_msa(dst, y_stride); -} - -void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_32x32_msa(dst, y_stride); -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c deleted file mode 100644 index 38a10e9b2..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c +++ /dev/null @@ -1,1488 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_ports/mem.h" -#include "aom_dsp/mips/loopfilter_msa.h" - -int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16u8 zero = { 0 }; - - /* load vector elements */ - LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); - - return 1; - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1_out = 
__msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); - filter48 += (4 * 16); - ST_UB2(q1_out, q2_out, filter48, 16); - filter48 += (2 * 16); - ST_UB(flat, filter48); - - return 0; - } -} - -void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { - v16u8 flat, flat2, filter8; - v16i8 zero = { 0 }; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; - v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; - v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; - v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; - v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; - v8i16 l_out, r_out; - - flat = LD_UB(filter48 + 96); - - LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - LD_UB4(filter48, 16, p2, p1, p0, q0); - LD_UB2(filter48 + 4 * 16, 16, q1, q2); - - src -= 3 * pitch; - ST_UB4(p2, p1, p0, q0, src, pitch); - src += (4 * pitch); - ST_UB2(q1, q2, src, pitch); - } else { - src -= 7 * pitch; - - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, - p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, - p2_r_in, p1_r_in, p0_r_in); - - q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); - - tmp0_r = p7_r_in << 3; - tmp0_r -= p7_r_in; - tmp0_r += p6_r_in; - tmp0_r += q0_r_in; - tmp1_r = p6_r_in + p5_r_in; - tmp1_r += p4_r_in; - tmp1_r += p3_r_in; - tmp1_r += p2_r_in; - tmp1_r += p1_r_in; - tmp1_r += p0_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, - p5_l_in, p4_l_in); - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, - p1_l_in, p0_l_in); - q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); - - tmp0_l = p7_l_in << 3; - tmp0_l -= p7_l_in; - tmp0_l += p6_l_in; - tmp0_l += q0_l_in; - tmp1_l = p6_l_in + p5_l_in; - tmp1_l += p4_l_in; - tmp1_l += p3_l_in; - tmp1_l += p2_l_in; - tmp1_l += p1_l_in; - tmp1_l += p0_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); - ST_UB(p6, src); - src += pitch; - - /* p5 */ - q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); - tmp0_r = p5_r_in - p6_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); - tmp0_l = p5_l_in - p6_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); - ST_UB(p5, src); - src += pitch; - - /* p4 */ - q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); - tmp0_r = p4_r_in - p5_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); - - q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); - tmp0_l = p4_l_in - p5_l_in; - tmp0_l += q2_l_in; - tmp0_l 
-= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); - ST_UB(p4, src); - src += pitch; - - /* p3 */ - q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); - tmp0_r = p3_r_in - p4_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); - tmp0_l = p3_l_in - p4_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); - ST_UB(p3, src); - src += pitch; - - /* p2 */ - q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); - filter8 = LD_UB(filter48); - tmp0_r = p2_r_in - p3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); - tmp0_l = p2_l_in - p3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* p1 */ - q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); - filter8 = LD_UB(filter48 + 16); - tmp0_r = p1_r_in - p2_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); - tmp0_l = p1_l_in - p2_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* p0 */ - q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); - filter8 = LD_UB(filter48 + 32); - tmp0_r = p0_r_in - p1_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); - tmp0_l = p0_l_in - p1_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q0 */ - q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); - filter8 = LD_UB(filter48 + 48); - tmp0_r = q7_r_in - p0_r_in; - tmp0_r += q0_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); - tmp0_l = q7_l_in - p0_l_in; - tmp0_l += q0_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q1 */ - filter8 = LD_UB(filter48 + 64); - tmp0_r = q7_r_in - q0_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p6_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q0_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p6_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q2 */ - filter8 = 
LD_UB(filter48 + 80); - tmp0_r = q7_r_in - q1_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p5_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q1_l_in; - tmp0_l += q2_l_in; - tmp0_l -= p5_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q3 */ - tmp0_r = q7_r_in - q2_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p4_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q2_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p4_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); - ST_UB(q3, src); - src += pitch; - - /* q4 */ - tmp0_r = q7_r_in - q3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p3_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p3_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); - ST_UB(q4, src); - src += pitch; - - /* q5 */ - tmp0_r = q7_r_in - q4_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p2_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q4_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p2_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); - ST_UB(q5, src); - src += pitch; - - /* q6 */ - tmp0_r = q7_r_in - q5_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p1_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q5_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p1_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); - ST_UB(q6, src); - } -} - -static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { - DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); - uint8_t early_exit = 0; - - (void)count; - - early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, - limit_ptr, thresh_ptr); - - if (0 == early_exit) { - aom_hz_lpf_t16_16w(src, pitch, filter48); - } -} - -static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, int32_t count) { - if (1 == count) { - uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; - uint64_t dword0, dword1; - v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 p0_filter16, p1_filter16; - v8i16 p2_filter8, p1_filter8, p0_filter8; - v8i16 q0_filter8, q1_filter8, q2_filter8; - v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; - v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; - v16i8 zero = { 0 }; - v8u16 tmp0, tmp1, tmp2; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = 
(v16u8)__msa_fill_b(*limit_ptr); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, - q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); - } else { - /* convert 8 bit input data into 16 bit */ - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, - p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, - q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); - PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); - - /* load 16 vector elements */ - LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); - LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); - - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - p2_d = __msa_copy_u_d((v2i64)p2_out, 0); - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - q2_d = __msa_copy_u_d((v2i64)q2_out, 0); - - SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); - SD(q1_d, src + pitch); - SD(q2_d, src + 2 * pitch); - } else { - /* LSB(right) 8 pixel operation */ - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, - zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, - q7_r); - - tmp0 = p7_r << 3; - tmp0 -= p7_r; - tmp0 += p6_r; - tmp0 += q0_r; - - src -= 7 * pitch; - - /* calculation of p6 and p5 */ - tmp1 = p6_r + p5_r + p4_r + p3_r; - tmp1 += (p2_r + p1_r + p0_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp0 = p5_r - p6_r + q1_r - p7_r; - tmp1 += tmp0; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of p4 and p3 */ - tmp0 = p4_r - p5_r + q2_r - p7_r; - tmp2 = p3_r - p4_r + q3_r - p7_r; - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 
0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of p2 and p1 */ - tmp0 = p2_r - p3_r + q4_r - p7_r; - tmp2 = p1_r - p2_r + q5_r - p7_r; - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of p0 and q0 */ - tmp0 = (p0_r - p1_r) + (q6_r - p7_r); - tmp2 = (q7_r - p0_r) + (q0_r - p7_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of q1 and q2 */ - tmp0 = q7_r - q0_r + q1_r - p6_r; - tmp2 = q7_r - q1_r + q2_r - p5_r; - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of q3 and q4 */ - tmp0 = (q7_r - q2_r) + (q3_r - p4_r); - tmp2 = (q7_r - q3_r) + (q4_r - p3_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of q5 and q6 */ - tmp0 = (q7_r - q4_r) + (q5_r - p2_r); - tmp2 = (q7_r - q5_r) + (q6_r - p1_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - } - } - } else { - mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, - count); - } -} - -void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); -} - -void 
aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); -} - -static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, - uint8_t *output, int32_t out_pitch) { - v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; - v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - - LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, - p1_org, p0_org); - /* 8x8 transpose */ - TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, - p0_org, p7, p6, p5, p4, p3, p2, p1, p0); - /* 8x8 transpose */ - ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, - tmp0, tmp1, tmp2, tmp3); - ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); - ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); - ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); - ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); - SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); - - ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); - output += (8 * out_pitch); - ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); -} - -static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, - uint8_t *output, int32_t out_pitch) { - v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - - LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); - TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, - q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); - ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); -} - -static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, - int32_t out_pitch) { - v16u8 row0, row1, row2, row3, row4, row5, row6, row7; - v16u8 row8, row9, row10, row11, row12, row13, row14, row15; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp2, tmp3; - - LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); - input += (8 * in_pitch); - LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); - - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, - row9, row10, row11, row12, row13, row14, row15, p7, p6, - p5, p4, p3, p2, p1, p0); - - /* transpose 16x8 matrix into 8x16 */ - /* total 8 intermediate register and 32 instructions */ - q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0); - q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1); - q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2); - q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3); - q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4); - q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5); - q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6); - q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7); - - ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); - tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7); - tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5); - - ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); - tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3); - tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1); - - ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); - q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q4 = 
(v16u8)__msa_ilvod_w(tmp3, tmp2); - - tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); - tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); - q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); - - ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); - q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); - - tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); - tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); - q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); - - ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); - output += (8 * out_pitch); - ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); -} - -int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch_org, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v16i8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3; - - /* load vector elements */ - LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); - return 1; - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - /* convert 16 bit output data into 8 bit */ - p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); - p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); - p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); - q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); - q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); - q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); - - ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); - filter48 += (4 * 16); - ST_UB2(q1_out, q2_out, filter48, 16); - filter48 += (2 * 16); - ST_UB(flat, filter48); - - return 0; - } -} - -int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { - v16i8 zero = { 0 }; - v16u8 filter8, flat, flat2; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; - v8u16 q7_r_in, 
q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; - v8u16 tmp0_r, tmp1_r; - v8i16 r_out; - - flat = LD_UB(filter48 + 6 * 16); - - LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); - - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - v8i16 vec0, vec1, vec2, vec3, vec4; - - LD_UB4(filter48, 16, p2, p1, p0, q0); - LD_UB2(filter48 + 4 * 16, 16, q1, q2); - - ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec3, vec4); - vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); - - src_org -= 3; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec2, 0, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec2, 4, (src_org + 4), pitch); - - return 1; - } else { - src -= 7 * 16; - - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, - p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, - p2_r_in, p1_r_in, p0_r_in); - q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); - - tmp0_r = p7_r_in << 3; - tmp0_r -= p7_r_in; - tmp0_r += p6_r_in; - tmp0_r += q0_r_in; - tmp1_r = p6_r_in + p5_r_in; - tmp1_r += p4_r_in; - tmp1_r += p3_r_in; - tmp1_r += p2_r_in; - tmp1_r += p1_r_in; - tmp1_r += p0_r_in; - tmp1_r += tmp0_r; - - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); - ST8x1_UB(p6, src); - src += 16; - - /* p5 */ - q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); - tmp0_r = p5_r_in - p6_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); - ST8x1_UB(p5, src); - src += 16; - - /* p4 */ - q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); - tmp0_r = p4_r_in - p5_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); - ST8x1_UB(p4, src); - src += 16; - - /* p3 */ - q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); - tmp0_r = p3_r_in - p4_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); - ST8x1_UB(p3, src); - src += 16; - - /* p2 */ - q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); - filter8 = LD_UB(filter48); - tmp0_r = p2_r_in - p3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* p1 */ - q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); - filter8 = LD_UB(filter48 + 16); - tmp0_r = p1_r_in - p2_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* p0 */ - q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); - filter8 = LD_UB(filter48 + 32); - tmp0_r = p0_r_in - p1_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); 
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q0 */ - q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); - filter8 = LD_UB(filter48 + 48); - tmp0_r = q7_r_in - p0_r_in; - tmp0_r += q0_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q1 */ - filter8 = LD_UB(filter48 + 64); - tmp0_r = q7_r_in - q0_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p6_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q2 */ - filter8 = LD_UB(filter48 + 80); - tmp0_r = q7_r_in - q1_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p5_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q3 */ - tmp0_r = q7_r_in - q2_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p4_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); - ST8x1_UB(q3, src); - src += 16; - - /* q4 */ - tmp0_r = q7_r_in - q3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p3_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); - ST8x1_UB(q4, src); - src += 16; - - /* q5 */ - tmp0_r = q7_r_in - q4_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p2_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); - ST8x1_UB(q5, src); - src += 16; - - /* q6 */ - tmp0_r = q7_r_in - q5_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p1_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); - ST8x1_UB(q6, src); - - return 0; - } -} - -void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint8_t early_exit = 0; - DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); - uint8_t *filter48 = &transposed_input[16 * 16]; - - transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); - - early_exit = - aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); - - if (0 == early_exit) { - early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); - - if (0 == early_exit) { - transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); - } - } -} - -int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, 
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16i8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - - /* load vector elements */ - LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec4, vec5); - - src_org -= 2; - ST4x8_UB(vec2, vec3, src_org, pitch); - src_org += 8 * pitch; - ST4x8_UB(vec4, vec5, src_org, pitch); - - return 1; - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); - filter48 += (4 * 16); - ST_UB2(q1_out, q2_out, filter48, 16); - filter48 += (2 * 16); - ST_UB(flat, filter48); - - return 0; - } -} - -int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { - v16u8 flat, flat2, filter8; - v16i8 zero = { 0 }; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; - v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; - v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; - v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; - v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; - v8i16 l_out, r_out; - - flat = LD_UB(filter48 + 6 * 16); - - LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); - - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - - LD_UB4(filter48, 16, p2, p1, p0, q0); - LD_UB2(filter48 + 4 * 16, 16, q1, q2); - - ILVR_B2_SH(p1, p2, q0, p0, vec0, 
vec1); - ILVRL_H2_SH(vec1, vec0, vec3, vec4); - ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec6, vec7); - ILVRL_B2_SH(q2, q1, vec2, vec5); - - src_org -= 3; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec2, 0, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec2, 4, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec5, 0, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec5, 4, (src_org + 4), pitch); - - return 1; - } else { - src -= 7 * 16; - - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, - p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, - p2_r_in, p1_r_in, p0_r_in); - q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); - - tmp0_r = p7_r_in << 3; - tmp0_r -= p7_r_in; - tmp0_r += p6_r_in; - tmp0_r += q0_r_in; - tmp1_r = p6_r_in + p5_r_in; - tmp1_r += p4_r_in; - tmp1_r += p3_r_in; - tmp1_r += p2_r_in; - tmp1_r += p1_r_in; - tmp1_r += p0_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, - p5_l_in, p4_l_in); - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, - p1_l_in, p0_l_in); - q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); - - tmp0_l = p7_l_in << 3; - tmp0_l -= p7_l_in; - tmp0_l += p6_l_in; - tmp0_l += q0_l_in; - tmp1_l = p6_l_in + p5_l_in; - tmp1_l += p4_l_in; - tmp1_l += p3_l_in; - tmp1_l += p2_l_in; - tmp1_l += p1_l_in; - tmp1_l += p0_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); - ST_UB(p6, src); - src += 16; - - /* p5 */ - q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); - tmp0_r = p5_r_in - p6_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); - tmp0_l = p5_l_in - p6_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); - ST_UB(p5, src); - src += 16; - - /* p4 */ - q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); - tmp0_r = p4_r_in - p5_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); - tmp0_l = p4_l_in - p5_l_in; - tmp0_l += q2_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); - ST_UB(p4, src); - src += 16; - - /* p3 */ - q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); - tmp0_r = p3_r_in - p4_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); - tmp0_l = p3_l_in - p4_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); - ST_UB(p3, src); - src += 16; - - /* p2 */ - q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); - filter8 = LD_UB(filter48); - tmp0_r = p2_r_in - p3_r_in; - tmp0_r += 
q4_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); - tmp0_l = p2_l_in - p3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* p1 */ - q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); - filter8 = LD_UB(filter48 + 16); - tmp0_r = p1_r_in - p2_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); - tmp0_l = p1_l_in - p2_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)(tmp1_l), 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* p0 */ - q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); - filter8 = LD_UB(filter48 + 32); - tmp0_r = p0_r_in - p1_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); - tmp0_l = p0_l_in - p1_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* q0 */ - q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); - filter8 = LD_UB(filter48 + 48); - tmp0_r = q7_r_in - p0_r_in; - tmp0_r += q0_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); - tmp0_l = q7_l_in - p0_l_in; - tmp0_l += q0_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* q1 */ - filter8 = LD_UB(filter48 + 64); - tmp0_r = q7_r_in - q0_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p6_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q0_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p6_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* q2 */ - filter8 = LD_UB(filter48 + 80); - tmp0_r = q7_r_in - q1_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p5_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q1_l_in; - tmp0_l += q2_l_in; - tmp0_l -= p5_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* q3 */ - tmp0_r = q7_r_in - q2_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p4_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q2_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p4_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); - ST_UB(q3, src); - src += 16; - - /* q4 */ - tmp0_r = q7_r_in - q3_r_in; - tmp0_r += 
q4_r_in; - tmp0_r -= p3_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p3_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); - ST_UB(q4, src); - src += 16; - - /* q5 */ - tmp0_r = q7_r_in - q4_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p2_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q4_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p2_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); - ST_UB(q5, src); - src += 16; - - /* q6 */ - tmp0_r = q7_r_in - q5_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p1_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q5_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p1_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); - ST_UB(q6, src); - - return 0; - } -} - -void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint8_t early_exit = 0; - DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); - uint8_t *filter48 = &transposed_input[16 * 16]; - - transpose_16x16((src - 8), pitch, &transposed_input[0], 16); - - early_exit = - aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); - - if (0 == early_exit) { - early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); - - if (0 == early_exit) { - transpose_16x16(transposed_input, 16, (src - 8), pitch); - } - } -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c deleted file mode 100644 index dc0a97764..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/loopfilter_msa.h" - -void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint64_t p1_d, p0_d, q0_d, q1_d; - v16u8 mask, hev, flat, thresh, b_limit, limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); -} - -void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0_ptr, - const uint8_t *limit0_ptr, - const uint8_t *thresh0_ptr, - const uint8_t *b_limit1_ptr, - const uint8_t *limit1_ptr, - const uint8_t *thresh1_ptr) { - v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); - thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); - thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); - - b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); - b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); - b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); - - limit0 = (v16u8)__msa_fill_b(*limit0_ptr); - limit1 = (v16u8)__msa_fill_b(*limit1_ptr); - limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, - mask, flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); - - ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); -} - -void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 mask, hev, flat, limit, thresh, b_limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v8i16 vec0, vec1, vec2, vec3; - - LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, - q3); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); - ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - - src -= 2; - ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); - src += 4 * pitch; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); -} - -void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0_ptr, - const uint8_t *limit0_ptr, - const uint8_t *thresh0_ptr, - const uint8_t *b_limit1_ptr, - const uint8_t *limit1_ptr, - const uint8_t *thresh1_ptr) { - v16u8 mask, hev, flat; - v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 row0, row1, row2, row3, row4, row5, row6, row7; - v16u8 row8, row9, row10, 
row11, row12, row13, row14, row15; - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - - LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); - LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, - row14, row15); - - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, - row9, row10, row11, row12, row13, row14, row15, p3, p2, - p1, p0, q0, q1, q2, q3); - - thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); - thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); - thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); - - b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); - b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); - b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); - - limit0 = (v16u8)__msa_fill_b(*limit0_ptr); - limit1 = (v16u8)__msa_fill_b(*limit1_ptr); - limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, - mask, flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); - ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); - ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); - ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); - ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); - - src -= 2; - - ST4x8_UB(tmp2, tmp3, src, pitch); - src += (8 * pitch); - ST4x8_UB(tmp4, tmp5, src, pitch); -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c deleted file mode 100644 index dc203e79c..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/loopfilter_msa.h" - -void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; - v16u8 mask, hev, flat, thresh, b_limit, limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; - v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; - v16i8 zero = { 0 }; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, - p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, - q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); - PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); - - p2_d = __msa_copy_u_d((v2i64)p2_out, 0); - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - q2_d = __msa_copy_u_d((v2i64)q2_out, 0); - - src -= 3 * pitch; - - SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); - src += (4 * pitch); - SD(q1_d, src); - src += pitch; - SD(q2_d, src); - } -} - -void aom_lpf_horizontal_8_dual_msa( - uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, - const uint8_t *thresh1) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16u8 zero = { 0 }; - - /* load vector elements */ - LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh0); - tmp = (v16u8)__msa_fill_b(*thresh1); - thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); 
- - b_limit = (v16u8)__msa_fill_b(*b_limit0); - tmp = (v16u8)__msa_fill_b(*b_limit1); - b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit); - - limit = (v16u8)__msa_fill_b(*limit0); - tmp = (v16u8)__msa_fill_b(*limit1); - limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - src -= 3 * pitch; - - ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); - src += (4 * pitch); - ST_UB2(q1_out, q2_out, src, pitch); - src += (2 * pitch); - } -} - -void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p1_out, p0_out, q0_out, q1_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v16u8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3, vec4; - - /* load vector elements */ - LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, - q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - /* Store 4 pixels p1-_q1 */ - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - - src -= 2; - ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); - src += 4 * pitch; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, 
q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, - p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - /* Store 6 pixels p2-_q2 */ - ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); - - src -= 3; - ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec4, 0, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec4, 4, src + 4, pitch); - } -} - -void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0, const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *b_limit1, const uint8_t *limit1, - const uint8_t *thresh1) { - uint8_t *temp_src; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p1_out, p0_out, q0_out, q1_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v16u8 row4, row5, row6, row7, row12, row13, row14, row15; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16u8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - - temp_src = src - 4; - - LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); - temp_src += (8 * pitch); - LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); - - /* transpose 16x8 matrix into 8x16 */ - TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, - row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, - q3); - - thresh = (v16u8)__msa_fill_b(*thresh0); - vec0 = (v8i16)__msa_fill_b(*thresh1); - thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh); - - b_limit = (v16u8)__msa_fill_b(*b_limit0); - vec0 = (v8i16)__msa_fill_b(*b_limit1); - b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit); - - limit = (v16u8)__msa_fill_b(*limit0); - vec0 = (v8i16)__msa_fill_b(*limit1); - limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec4, vec5); - - src -= 2; - ST4x8_UB(vec2, vec3, src, pitch); - src += 8 * pitch; - ST4x8_UB(vec4, vec5, src, pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); 
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - - /* filter8 */ - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec3, vec4); - ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec6, vec7); - ILVRL_B2_SH(q2, q1, vec2, vec5); - - src -= 3; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec2, 0, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec2, 4, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec5, 0, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec5, 4, src + 4, pitch); - } -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c deleted file mode 100644 index 8c41278be..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask; - uint32_t hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s); - - /* loop filter designed to work using chars so that we can make maximum use - of 8 bit simd instructions. */ - for (i = 0; i < 2; i++) { - sm1 = s - (pitch << 2); - s0 = sm1 + pitch; - s1 = s0 + pitch; - s2 = s - pitch; - s3 = s; - s4 = s + pitch; - s5 = s4 + pitch; - s6 = s5 + pitch; - - __asm__ __volatile__( - "lw %[p1], (%[s1]) \n\t" - "lw %[p2], (%[s2]) \n\t" - "lw %[p3], (%[s3]) \n\t" - "lw %[p4], (%[s4]) \n\t" - - : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - /* if (p1 - p4 == 0) and (p2 - p3 == 0) - mask will be zero and filtering is not needed */ - if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - __asm__ __volatile__( - "lw %[pm1], (%[sm1]) \n\t" - "lw %[p0], (%[s0]) \n\t" - "lw %[p5], (%[s5]) \n\t" - "lw %[p6], (%[s6]) \n\t" - - : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) - : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); - - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, - p6, thresh_vec, &hev, &mask); - - /* if mask == 0 do filtering is not needed */ - if (mask) { - /* filtering */ - filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); - - __asm__ __volatile__( - "sw %[p1], (%[s1]) \n\t" - "sw %[p2], (%[s2]) \n\t" - "sw %[p3], (%[s3]) \n\t" - "sw %[p4], (%[s4]) \n\t" - - : - : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), - [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - } - } - - s = s + 4; - } -} - -void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s + pitch); - - for (i = 0; i < 2; i++) 
{ - s1 = s; - s2 = s + pitch; - s3 = s2 + pitch; - s4 = s3 + pitch; - s = s4 + pitch; - - /* load quad-byte vectors - * memory is 4 byte aligned - */ - p2 = *((uint32_t *)(s1 - 4)); - p6 = *((uint32_t *)(s1)); - p1 = *((uint32_t *)(s2 - 4)); - p5 = *((uint32_t *)(s2)); - p0 = *((uint32_t *)(s3 - 4)); - p4 = *((uint32_t *)(s3)); - pm1 = *((uint32_t *)(s4 - 4)); - p3 = *((uint32_t *)(s4)); - - /* transpose pm1, p0, p1, p2 */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" - "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" - "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" - "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" - - "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" - "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" - "append %[p1], %[sec3], 16 \n\t" - "append %[pm1], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), - [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose p3, p4, p5, p6 */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" - "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" - "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" - "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" - - "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" - "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" - "append %[p5], %[sec3], 16 \n\t" - "append %[p3], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), - [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* if (p1 - p4 == 0) and (p2 - p3 == 0) - * mask will be zero and filtering is not needed - */ - if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, - p6, thresh_vec, &hev, &mask); - - /* if mask == 0 do filtering is not needed */ - if (mask) { - /* filtering */ - filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); - - /* unpack processed 4x4 neighborhood - * don't use transpose on output data - * because memory isn't aligned - */ - __asm__ __volatile__( - "sb %[p4], 1(%[s4]) \n\t" - "sb %[p3], 0(%[s4]) \n\t" - "sb %[p2], -1(%[s4]) \n\t" - "sb %[p1], -2(%[s4]) \n\t" - - : - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [s4] "r"(s4)); - - __asm__ __volatile__( - "srl %[p4], %[p4], 8 \n\t" - "srl %[p3], %[p3], 8 \n\t" - "srl %[p2], %[p2], 8 \n\t" - "srl %[p1], %[p1], 8 \n\t" - - : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) - :); - - __asm__ __volatile__( - "sb %[p4], 1(%[s3]) \n\t" - "sb %[p3], 0(%[s3]) \n\t" - "sb %[p2], -1(%[s3]) \n\t" - "sb %[p1], -2(%[s3]) \n\t" - - : [p1] "+r"(p1) - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); - - __asm__ __volatile__( - "srl %[p4], %[p4], 8 \n\t" - "srl %[p3], %[p3], 8 \n\t" - "srl %[p2], %[p2], 8 \n\t" - "srl %[p1], %[p1], 8 \n\t" - - : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) - :); - - __asm__ __volatile__( - "sb %[p4], 1(%[s2]) \n\t" - "sb %[p3], 0(%[s2]) \n\t" - "sb %[p2], -1(%[s2]) \n\t" - "sb %[p1], -2(%[s2]) \n\t" - - : - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] 
"r"(p1), - [s2] "r"(s2)); - - __asm__ __volatile__( - "srl %[p4], %[p4], 8 \n\t" - "srl %[p3], %[p3], 8 \n\t" - "srl %[p2], %[p2], 8 \n\t" - "srl %[p1], %[p1], 8 \n\t" - - : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) - :); - - __asm__ __volatile__( - "sb %[p4], 1(%[s1]) \n\t" - "sb %[p3], 0(%[s1]) \n\t" - "sb %[p2], -1(%[s1]) \n\t" - "sb %[p1], -2(%[s1]) \n\t" - - : - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [s1] "r"(s1)); - } - } - } -} - -void aom_lpf_horizontal_4_dual_dspr2( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); -} - -void aom_lpf_horizontal_8_dual_dspr2( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); - aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h deleted file mode 100644 index 28f0dc35a..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h +++ /dev/null @@ -1,736 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_mem/aom_mem.h" -#include "aom_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -/* inputs & outputs are quad-byte vectors */ -static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, - uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { - int32_t aom_filter_l, aom_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; - - N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; - HWM = 0xFF00FF00; - - vps0 = (*ps0) ^ N128; - vps1 = (*ps1) ^ N128; - vqs0 = (*qs0) ^ N128; - vqs1 = (*qs1) ^ N128; - - /* use halfword pairs instead quad-bytes because of accuracy */ - vps0_l = vps0 & HWM; - vps0_r = vps0 << 8; - vps0_r = vps0_r & HWM; - - vps1_l = vps1 & HWM; - vps1_r = vps1 << 8; - vps1_r = vps1_r & HWM; - - vqs0_l = vqs0 & HWM; - vqs0_r = vqs0 << 8; - vqs0_r = vqs0_r & HWM; - - vqs1_l = vqs1 & HWM; - vqs1_r = vqs1 << 8; - vqs1_r = vqs1_r & HWM; - - mask_l = mask & HWM; - mask_r = mask << 8; - mask_r = mask_r & HWM; - - hev_l = hev & HWM; - hev_r = hev << 8; - hev_r = hev_r & HWM; - - __asm__ __volatile__( - /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ - "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" - "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" - - /* qs0 - ps0 */ - "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" - "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" - - /* aom_filter &= hev; */ - "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" - - /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_l], %[hev_l], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_r], %[hev_r], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - - /* aom_filter &= mask; */ - "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" - - : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), - [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), - [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) - : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), - [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), - [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), - [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), - [HWM] "r"(HWM)); - - /* save bottom 3 bits so that we round one side +4 and the other +3 */ - __asm__ __volatile__( - /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ - "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" - "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" - - /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ - "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" - "addq_s.ph 
%[Filter2_r], %[aom_filter_r], %[t1] \n\t" - "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" - "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" - - "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" - "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" - - "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" - "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" - - /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ - "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" - "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" - - /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ - "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - - : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), - [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), - [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), - [vqs0_r] "+r"(vqs0_r) - : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), - [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); - - __asm__ __volatile__( - /* (aom_filter += 1) >>= 1 */ - "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" - "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" - - /* aom_filter &= ~hev; */ - "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" - "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" - - /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ - "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" - "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" - - /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ - "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - - : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), - [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), - [vqs1_r] "+r"(vqs1_r) - : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); - - /* Create quad-bytes from halfword pairs */ - vqs0_l = vqs0_l & HWM; - vqs1_l = vqs1_l & HWM; - vps0_l = vps0_l & HWM; - vps1_l = vps1_l & HWM; - - __asm__ __volatile__( - "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" - "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" - "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" - "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - - : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), - [vqs0_r] "+r"(vqs0_r) - :); - - vqs0 = vqs0_l | vqs0_r; - vqs1 = vqs1_l | vqs1_r; - vps0 = vps0_l | vps0_r; - vps1 = vps1_l | vps1_r; - - *ps0 = vps0 ^ N128; - *ps1 = vps1 ^ N128; - *qs0 = vqs0 ^ N128; - *qs1 = vqs1 ^ N128; -} - -static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, - uint32_t ps0, uint32_t qs0, uint32_t qs1, - uint32_t *p1_f0, uint32_t *p0_f0, - uint32_t *q0_f0, uint32_t *q1_f0) { - int32_t aom_filter_l, aom_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; - - N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; - HWM = 0xFF00FF00; - - vps0 = (ps0) ^ N128; - vps1 = (ps1) ^ N128; - vqs0 = (qs0) ^ N128; - vqs1 = (qs1) ^ N128; - - /* use halfword pairs instead quad-bytes because of accuracy */ - vps0_l = vps0 & HWM; - vps0_r = vps0 << 8; - vps0_r = vps0_r & HWM; - - vps1_l = vps1 & HWM; - vps1_r = vps1 << 8; - vps1_r = vps1_r & HWM; - - vqs0_l = vqs0 & HWM; - vqs0_r = vqs0 << 8; - vqs0_r = vqs0_r & HWM; - - vqs1_l = vqs1 & HWM; - vqs1_r = vqs1 << 8; - 
vqs1_r = vqs1_r & HWM; - - mask_l = mask & HWM; - mask_r = mask << 8; - mask_r = mask_r & HWM; - - hev_l = hev & HWM; - hev_r = hev << 8; - hev_r = hev_r & HWM; - - __asm__ __volatile__( - /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ - "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" - "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" - - /* qs0 - ps0 */ - "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" - "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" - - /* aom_filter &= hev; */ - "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" - - /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_l], %[hev_l], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_r], %[hev_r], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - - /* aom_filter &= mask; */ - "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" - - : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), - [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), - [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) - : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), - [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), - [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), - [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), - [HWM] "r"(HWM)); - - /* save bottom 3 bits so that we round one side +4 and the other +3 */ - __asm__ __volatile__( - /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ - "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" - "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" - - /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ - "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" - "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" - "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" - "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" - - "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" - "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" - - "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" - "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" - - /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ - "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" - "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" - - /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ - "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - - : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), - [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), - [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), - [vqs0_r] "+r"(vqs0_r) - : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), - [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); - - __asm__ __volatile__( - /* (aom_filter += 1) >>= 1 */ - "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" - "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" - - /* aom_filter &= ~hev; */ - "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" - "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" - - /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ - "addq_s.ph 
%[vps1_l], %[vps1_l], %[Filter1_l] \n\t" - "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" - - /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ - "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - - : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), - [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), - [vqs1_r] "+r"(vqs1_r) - : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); - - /* Create quad-bytes from halfword pairs */ - vqs0_l = vqs0_l & HWM; - vqs1_l = vqs1_l & HWM; - vps0_l = vps0_l & HWM; - vps1_l = vps1_l & HWM; - - __asm__ __volatile__( - "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" - "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" - "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" - "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - - : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), - [vqs0_r] "+r"(vqs0_r) - :); - - vqs0 = vqs0_l | vqs0_r; - vqs1 = vqs1_l | vqs1_r; - vps0 = vps0_l | vps0_r; - vps1 = vps1_l | vps1_r; - - *p0_f0 = vps0 ^ N128; - *p1_f0 = vps1 ^ N128; - *q0_f0 = vqs0 ^ N128; - *q1_f0 = vqs1 ^ N128; -} - -static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, - uint32_t *op0, uint32_t *oq0, uint32_t *oq1, - uint32_t *oq2, uint32_t *oq3) { - /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ - const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; - - /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ - /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ - /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ - /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ - /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ - /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - - __asm__ __volatile__( - "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" - - "shll.ph %[tmp], %[p3], 1 \n\t" - "addu.ph %[res_op2], %[tmp], %[p3] \n\t" - "addu.ph %[res_op1], %[p3], %[p3] \n\t" - "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" - "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" - "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" - "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" - "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" - "shrl.ph %[res_op1], %[res_op1], 3 \n\t" - "shrl.ph %[res_op2], %[res_op2], 3 \n\t" - "addu.ph %[res_op0], %[p3], %[p0] \n\t" - "addu.ph %[res_oq0], %[q0], %[q3] \n\t" - "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq1], %[q3], %[q3] \n\t" - "shll.ph %[tmp], %[q3], 1 \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" - "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" - "subu.ph %[res_oq1], 
%[res_oq1], %[p2] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" - "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" - "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" - "shrl.ph %[res_op0], %[res_op0], 3 \n\t" - "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - - : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), - [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), - [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), - [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), - [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); - - *op2 = res_op2; - *op1 = res_op1; - *op0 = res_op0; - *oq0 = res_oq0; - *oq1 = res_oq1; - *oq2 = res_oq2; -} - -static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, - uint32_t p0, uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t q3, uint32_t *op2_f1, - uint32_t *op1_f1, uint32_t *op0_f1, - uint32_t *oq0_f1, uint32_t *oq1_f1, - uint32_t *oq2_f1) { - /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; - - /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ - /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ - /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ - /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ - /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ - /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - - __asm__ __volatile__( - "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" - - "shll.ph %[tmp], %[p3], 1 \n\t" - "addu.ph %[res_op2], %[tmp], %[p3] \n\t" - "addu.ph %[res_op1], %[p3], %[p3] \n\t" - "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" - "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" - "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" - "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" - "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" - "shrl.ph %[res_op1], %[res_op1], 3 \n\t" - "shrl.ph %[res_op2], %[res_op2], 3 \n\t" - "addu.ph %[res_op0], %[p3], %[p0] \n\t" - "addu.ph %[res_oq0], %[q0], %[q3] \n\t" - "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq1], %[q3], %[q3] \n\t" - "shll.ph %[tmp], %[q3], 1 \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" - "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" - "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" - "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" - "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" - "shrl.ph %[res_op0], %[res_op0], 3 \n\t" - "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - - : 
[add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), - [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), - [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), - [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), - [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); - - *op2_f1 = res_op2; - *op1_f1 = res_op1; - *op0_f1 = res_op0; - *oq0_f1 = res_oq0; - *oq1_f1 = res_oq1; - *oq2_f1 = res_oq2; -} - -static INLINE void wide_mbfilter_dspr2( - uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, - uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, - uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, - uint32_t *oq7) { - const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; - const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; - uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; - uint32_t tmp; - uint32_t add_p6toq6; - uint32_t u32Eight = 0x00080008; - - __asm__ __volatile__( - /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 - which is used most of the time */ - "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" - - : [add_p6toq6] "=&r"(add_p6toq6) - : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), - [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), - [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), - [u32Eight] "r"(u32Eight)); - - __asm__ __volatile__( - /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + - p3 + p2 + p1 + p0 + q0, 4) */ - "shll.ph %[tmp], %[p7], 3 \n\t" - "subu.ph %[res_op6], %[tmp], %[p7] \n\t" - "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" - "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" - "shrl.ph %[res_op6], %[res_op6], 4 \n\t" - - /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + - p2 + p1 + p0 + q0 + q1, 4) */ - "shll.ph %[tmp], %[p7], 2 \n\t" - "addu.ph %[res_op5], %[tmp], %[p7] \n\t" - "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" - "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" - "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q6] 
\n\t" - "shrl.ph %[res_op5], %[res_op5], 4 \n\t" - - /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + - p1 + p0 + q0 + q1 + q2, 4) */ - "shll.ph %[tmp], %[p7], 2 \n\t" - "addu.ph %[res_op4], %[tmp], %[p7] \n\t" - "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" - "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" - "shrl.ph %[res_op4], %[res_op4], 4 \n\t" - - /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + - p1 + p0 + q0 + q1 + q2 + q3, 4) */ - "shll.ph %[tmp], %[p7], 2 \n\t" - "addu.ph %[res_op3], %[tmp], %[p3] \n\t" - "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" - "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" - "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" - "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" - "shrl.ph %[res_op3], %[res_op3], 4 \n\t" - - /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + - p0 + q0 + q1 + q2 + q3 + q4, 4) */ - "shll.ph %[tmp], %[p7], 1 \n\t" - "addu.ph %[res_op2], %[tmp], %[p7] \n\t" - "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" - "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" - "shrl.ph %[res_op2], %[res_op2], 4 \n\t" - - /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + - p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ - "shll.ph %[tmp], %[p7], 1 \n\t" - "addu.ph %[res_op1], %[tmp], %[p1] \n\t" - "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" - "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" - "shrl.ph %[res_op1], %[res_op1], 4 \n\t" - - /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ - "addu.ph %[res_op0], %[p7], %[p0] \n\t" - "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" - "shrl.ph %[res_op0], %[res_op0], 4 \n\t" - - : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), - [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), - [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), - [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) - : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), - [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), - [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), - [add_p6toq6] "r"(add_p6toq6)); - - *op6 = res_op6; - *op5 = res_op5; - *op4 = res_op4; - *op3 = res_op3; - *op2 = res_op2; - *op1 = res_op1; - *op0 = res_op0; - - __asm__ __volatile__( - /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + - q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ - "addu.ph %[res_oq0], %[q7], %[q0] \n\t" - "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" - "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" - - /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + - q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ - "shll.ph %[tmp], %[q7], 1 \n\t" - "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" - "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" - "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" - - /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + - q3 + q4 + q5 + q6 + q7 * 3, 4) */ - "shll.ph %[tmp], %[q7], 1 \n\t" - "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" 
- "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t" - "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" - - /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + - q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ - "shll.ph %[tmp], %[q7], 2 \n\t" - "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" - "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" - "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" - "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" - "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" - "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" - - /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + - q4 * 2 + q5 + q6 + q7 * 5, 4) */ - "shll.ph %[tmp], %[q7], 2 \n\t" - "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" - "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" - "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" - - /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + - q5 * 2 + q6 + q7 * 6, 4) */ - "shll.ph %[tmp], %[q7], 2 \n\t" - "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" - "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" - "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" - "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" - - /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + - q4 + q5 + q6 * 2 + q7 * 7, 4) */ - "shll.ph %[tmp], %[q7], 3 \n\t" - "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" - "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" - "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" - - : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), - [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), - [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), - [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) - : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), - [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), - [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), - [add_p6toq6] "r"(add_p6toq6)); - - *oq0 = res_oq0; - *oq1 = res_oq1; - *oq2 = res_oq2; - *oq3 = res_oq3; - *oq4 = res_oq4; - *oq5 = res_oq5; - *oq6 = res_oq6; -} -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h deleted file mode 100644 index 62295d69d..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h +++ /dev/null @@ -1,437 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_mem/aom_mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -#define STORE_F0() \ - { \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s4]) \n\t" \ - "sb %[q0_f0], 0(%[s4]) \n\t" \ - "sb %[p0_f0], -1(%[s4]) \n\t" \ - "sb %[p1_f0], -2(%[s4]) \n\t" \ - \ - : \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ - [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ - \ - __asm__ __volatile__( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ - [p1_f0] "+r"(p1_f0) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s3]) \n\t" \ - "sb %[q0_f0], 0(%[s3]) \n\t" \ - "sb %[p0_f0], -1(%[s3]) \n\t" \ - "sb %[p1_f0], -2(%[s3]) \n\t" \ - \ - : [p1_f0] "+r"(p1_f0) \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ - [p0_f0] "r"(p0_f0)); \ - \ - __asm__ __volatile__( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ - [p1_f0] "+r"(p1_f0) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s2]) \n\t" \ - "sb %[q0_f0], 0(%[s2]) \n\t" \ - "sb %[p0_f0], -1(%[s2]) \n\t" \ - "sb %[p1_f0], -2(%[s2]) \n\t" \ - \ - : \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ - [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ - \ - __asm__ __volatile__( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ - [p1_f0] "+r"(p1_f0) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s1]) \n\t" \ - "sb %[q0_f0], 0(%[s1]) \n\t" \ - "sb %[p0_f0], -1(%[s1]) \n\t" \ - "sb %[p1_f0], -2(%[s1]) \n\t" \ - \ - : \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ - [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ - } - -#define STORE_F1() \ - { \ - __asm__ __volatile__( \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - \ - : \ - : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ - [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ - \ - __asm__ __volatile__( \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], %[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - \ - : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ - [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - 
"sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - \ - : \ - : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ - [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ - \ - __asm__ __volatile__( \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - \ - : \ - : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ - [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ - \ - __asm__ __volatile__( \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], %[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - \ - : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ - [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - \ - : \ - : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ - [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ - } - -#define STORE_F2() \ - { \ - __asm__ __volatile__( \ - "sb %[q6_r], 6(%[s4]) \n\t" \ - "sb %[q5_r], 5(%[s4]) \n\t" \ - "sb %[q4_r], 4(%[s4]) \n\t" \ - "sb %[q3_r], 3(%[s4]) \n\t" \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - "sb %[p3_r], -4(%[s4]) \n\t" \ - "sb %[p4_r], -5(%[s4]) \n\t" \ - "sb %[p5_r], -6(%[s4]) \n\t" \ - "sb %[p6_r], -7(%[s4]) \n\t" \ - \ - : \ - : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ - [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ - [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ - [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ - [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ - \ - __asm__ __volatile__( \ - "srl %[q6_r], %[q6_r], 16 \n\t" \ - "srl %[q5_r], %[q5_r], 16 \n\t" \ - "srl %[q4_r], %[q4_r], 16 \n\t" \ - "srl %[q3_r], %[q3_r], 16 \n\t" \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], %[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - "srl %[p3_r], %[p3_r], 16 \n\t" \ - "srl %[p4_r], %[p4_r], 16 \n\t" \ - "srl %[p5_r], %[p5_r], 16 \n\t" \ - "srl %[p6_r], %[p6_r], 16 \n\t" \ - \ - : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ - [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ - [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ - [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ - [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q6_r], 6(%[s3]) \n\t" \ - "sb %[q5_r], 5(%[s3]) \n\t" \ - "sb %[q4_r], 4(%[s3]) \n\t" \ - "sb %[q3_r], 3(%[s3]) \n\t" \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - "sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - "sb %[p3_r], -4(%[s3]) \n\t" \ - "sb %[p4_r], -5(%[s3]) \n\t" \ - "sb %[p5_r], -6(%[s3]) \n\t" \ - "sb %[p6_r], -7(%[s3]) \n\t" \ - \ - : \ - : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ - [q3_r] "r"(q3_r), [q2_r] 
"r"(q2_r), [q1_r] "r"(q1_r), \ - [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ - [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ - [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ - \ - __asm__ __volatile__( \ - "sb %[q6_l], 6(%[s2]) \n\t" \ - "sb %[q5_l], 5(%[s2]) \n\t" \ - "sb %[q4_l], 4(%[s2]) \n\t" \ - "sb %[q3_l], 3(%[s2]) \n\t" \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - "sb %[p3_l], -4(%[s2]) \n\t" \ - "sb %[p4_l], -5(%[s2]) \n\t" \ - "sb %[p5_l], -6(%[s2]) \n\t" \ - "sb %[p6_l], -7(%[s2]) \n\t" \ - \ - : \ - : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ - [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ - [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ - [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ - [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ - \ - __asm__ __volatile__( \ - "srl %[q6_l], %[q6_l], 16 \n\t" \ - "srl %[q5_l], %[q5_l], 16 \n\t" \ - "srl %[q4_l], %[q4_l], 16 \n\t" \ - "srl %[q3_l], %[q3_l], 16 \n\t" \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], %[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - "srl %[p3_l], %[p3_l], 16 \n\t" \ - "srl %[p4_l], %[p4_l], 16 \n\t" \ - "srl %[p5_l], %[p5_l], 16 \n\t" \ - "srl %[p6_l], %[p6_l], 16 \n\t" \ - \ - : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ - [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ - [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ - [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ - [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q6_l], 6(%[s1]) \n\t" \ - "sb %[q5_l], 5(%[s1]) \n\t" \ - "sb %[q4_l], 4(%[s1]) \n\t" \ - "sb %[q3_l], 3(%[s1]) \n\t" \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - "sb %[p3_l], -4(%[s1]) \n\t" \ - "sb %[p4_l], -5(%[s1]) \n\t" \ - "sb %[p5_l], -6(%[s1]) \n\t" \ - "sb %[p6_l], -7(%[s1]) \n\t" \ - \ - : \ - : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ - [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ - [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ - [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ - [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ - } - -#define PACK_LEFT_0TO3() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ - "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ - "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ - "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ - "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ - "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ - "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ - "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ - \ - : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ - [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ - [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ - : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ - [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ - } - -#define PACK_LEFT_4TO7() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ - "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ - "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ - "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ - "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ 
- "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ - "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ - "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ - \ - : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \ - [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ - [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ - : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ - [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ - } - -#define PACK_RIGHT_0TO3() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ - "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ - "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ - "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ - "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ - "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ - "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ - "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ - \ - : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ - [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ - [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ - : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ - [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ - } - -#define PACK_RIGHT_4TO7() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ - "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ - "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ - "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ - "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ - "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ - "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ - "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ - \ - : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ - [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ - [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ - : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ - [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ - } - -#define COMBINE_LEFT_RIGHT_0TO2() \ - { \ - __asm__ __volatile__( \ - "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ - "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ - "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ - "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ - "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ - "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ - \ - : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ - [q1] "=&r"(q1), [q2] "=&r"(q2) \ - : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ - [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ - [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ - [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ - } - -#define COMBINE_LEFT_RIGHT_3TO6() \ - { \ - __asm__ __volatile__( \ - "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ - "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ - "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ - "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ - "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ - "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ - "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ - "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ - \ - : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ - [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ - [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ - [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ - [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ - [q6_r] "r"(q6_r)); \ - } - -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git 
a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h deleted file mode 100644 index a0f57f386..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_mem/aom_mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -/* processing 4 pixels at the same time - * compute hev and mask in the same function */ -static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, - uint32_t p1, uint32_t p0, uint32_t p3, - uint32_t p2, uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t q3, - uint32_t thresh, uint32_t *hev, - uint32_t *mask) { - uint32_t c, r, r3, r_k; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t hev1; - - __asm__ __volatile__( - /* mask |= (abs(p3 - p2) > limit) */ - "subu_s.qb %[c], %[p3], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], $0, %[c] \n\t" - - /* mask |= (abs(p2 - p1) > limit) */ - "subu_s.qb %[c], %[p2], %[p1] \n\t" - "subu_s.qb %[r_k], %[p1], %[p2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(p1 - p0) > limit) - * hev |= (abs(p1 - p0) > thresh) - */ - "subu_s.qb %[c], %[p1], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], $0, %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(q1 - q0) > limit) - * hev |= (abs(q1 - q0) > thresh) - */ - "subu_s.qb %[c], %[q1], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], %[r3], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(q2 - q1) > limit) */ - "subu_s.qb %[c], %[q2], %[q1] \n\t" - "subu_s.qb %[r_k], %[q1], %[q2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r3], %[r3], 24 \n\t" - - /* mask |= (abs(q3 - q2) > limit) */ - "subu_s.qb %[c], %[q3], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) - : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), - [thresh] "r"(thresh)); - - __asm__ __volatile__( - /* abs(p0 - q0) */ - "subu_s.qb %[c], %[p0], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[p0] \n\t" - "wrdsp %[r3] \n\t" - "or %[s1], %[r_k], %[c] \n\t" - - /* 
abs(p1 - q1) */ - "subu_s.qb %[c], %[p1], %[q1] \n\t" - "addu_s.qb %[s3], %[s1], %[s1] \n\t" - "pick.qb %[hev1], %[ones], $0 \n\t" - "subu_s.qb %[r_k], %[q1], %[p1] \n\t" - "or %[s2], %[r_k], %[c] \n\t" - - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ - "shrl.qb %[s2], %[s2], 1 \n\t" - "addu_s.qb %[s1], %[s2], %[s3] \n\t" - "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r], %[r], 24 \n\t" - - "wrdsp %[r] \n\t" - "pick.qb %[s2], $0, %[ones] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), - [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), - [ones] "r"(ones), [flimit] "r"(flimit)); - - *hev = hev1; - *mask = s2; -} - -static INLINE void filter_hev_mask_flatmask4_dspr2( - uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, - uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, - uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { - uint32_t c, r, r3, r_k, r_flat; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t hev1; - uint32_t flat1; - - __asm__ __volatile__( - /* mask |= (abs(p3 - p2) > limit) */ - "subu_s.qb %[c], %[p3], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], $0, %[c] \n\t" - - /* mask |= (abs(p2 - p1) > limit) */ - "subu_s.qb %[c], %[p2], %[p1] \n\t" - "subu_s.qb %[r_k], %[p1], %[p2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(p1 - p0) > limit) - * hev |= (abs(p1 - p0) > thresh) - * flat |= (abs(p1 - p0) > thresh) - */ - "subu_s.qb %[c], %[p1], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], $0, %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], $0, %[c] \n\t" - - /* mask |= (abs(q1 - q0) > limit) - * hev |= (abs(q1 - q0) > thresh) - * flat |= (abs(q1 - q0) > thresh) - */ - "subu_s.qb %[c], %[q1], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], %[r3], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p0 - p2) > thresh) */ - "subu_s.qb %[c], %[p0], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q0 - q2) > thresh) */ - "subu_s.qb %[c], %[q0], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p3 - p0) > thresh) */ - "subu_s.qb %[c], %[p3], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q3 - q0) > thresh) */ - "subu_s.qb %[c], %[q3], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - 
"sll %[r_flat], %[r_flat], 24 \n\t" - /* look at stall here */ - "wrdsp %[r_flat] \n\t" - "pick.qb %[flat1], $0, %[ones] \n\t" - - /* mask |= (abs(q2 - q1) > limit) */ - "subu_s.qb %[c], %[q2], %[q1] \n\t" - "subu_s.qb %[r_k], %[q1], %[q2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r3], %[r3], 24 \n\t" - - /* mask |= (abs(q3 - q2) > limit) */ - "subu_s.qb %[c], %[q3], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), - [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) - : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), - [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); - - __asm__ __volatile__( - /* abs(p0 - q0) */ - "subu_s.qb %[c], %[p0], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[p0] \n\t" - "wrdsp %[r3] \n\t" - "or %[s1], %[r_k], %[c] \n\t" - - /* abs(p1 - q1) */ - "subu_s.qb %[c], %[p1], %[q1] \n\t" - "addu_s.qb %[s3], %[s1], %[s1] \n\t" - "pick.qb %[hev1], %[ones], $0 \n\t" - "subu_s.qb %[r_k], %[q1], %[p1] \n\t" - "or %[s2], %[r_k], %[c] \n\t" - - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ - "shrl.qb %[s2], %[s2], 1 \n\t" - "addu_s.qb %[s1], %[s2], %[s3] \n\t" - "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r], %[r], 24 \n\t" - - "wrdsp %[r] \n\t" - "pick.qb %[s2], $0, %[ones] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), - [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), - [ones] "r"(ones), [flimit] "r"(flimit)); - - *hev = hev1; - *mask = s2; - *flat = flat1; -} - -static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, - uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, - uint32_t q3, uint32_t q4, uint32_t *flat2) { - uint32_t c, r, r_k, r_flat; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t flat1, flat3; - - __asm__ __volatile__( - /* flat |= (abs(p4 - p0) > thresh) */ - "subu_s.qb %[c], %[p4], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p4] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r], $0, %[c] \n\t" - - /* flat |= (abs(q4 - q0) > thresh) */ - "subu_s.qb %[c], %[q4], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q4] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r], %[r], 24 \n\t" - "wrdsp %[r] \n\t" - "pick.qb %[flat3], $0, %[ones] \n\t" - - /* flat |= (abs(p1 - p0) > thresh) */ - "subu_s.qb %[c], %[p1], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], $0, %[c] \n\t" - - /* flat |= (abs(q1 - q0) > thresh) */ - "subu_s.qb %[c], %[q1], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p0 - p2) > thresh) */ - "subu_s.qb %[c], %[p0], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q0 - q2) > thresh) */ - 
"subu_s.qb %[c], %[q0], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p3 - p0) > thresh) */ - "subu_s.qb %[c], %[p3], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q3 - q0) > thresh) */ - "subu_s.qb %[c], %[q3], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - "sll %[r_flat], %[r_flat], 24 \n\t" - "wrdsp %[r_flat] \n\t" - "pick.qb %[flat1], $0, %[ones] \n\t" - /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ - "and %[flat1], %[flat3], %[flat1] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), - [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), - [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), - [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); - - *flat2 = flat1; -} -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c deleted file mode 100644 index b67ccfe9d..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c +++ /dev/null @@ -1,590 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint32_t mask; - uint32_t hev, flat; - uint8_t i; - uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p3, p2, p1, p0, q0, q1, q2, q3; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s); - - for (i = 0; i < 2; i++) { - sp3 = s - (pitch << 2); - sp2 = sp3 + pitch; - sp1 = sp2 + pitch; - sp0 = sp1 + pitch; - sq0 = s; - sq1 = s + pitch; - sq2 = sq1 + pitch; - sq3 = sq2 + pitch; - - __asm__ __volatile__( - "lw %[p3], (%[sp3]) \n\t" - "lw %[p2], (%[sp2]) \n\t" - "lw %[p1], (%[sp1]) \n\t" - "lw %[p0], (%[sp0]) \n\t" - "lw %[q0], (%[sq0]) \n\t" - "lw %[q1], (%[sq1]) \n\t" - "lw %[q2], (%[sq2]) \n\t" - "lw %[q3], (%[sq3]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) - : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - __asm__ __volatile__( - "sw %[p1_f0], (%[sp1]) \n\t" - "sw %[p0_f0], (%[sp0]) \n\t" - "sw %[q0_f0], (%[sq0]) \n\t" - "sw %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1)); - } else if ((mask & flat) == 0xFFFFFFFF) { - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - COMBINE_LEFT_RIGHT_0TO2() - - __asm__ __volatile__( - "sw %[p2], (%[sp2]) \n\t" - "sw %[p1], (%[sp1]) \n\t" - "sw %[p0], (%[sp0]) \n\t" - "sw %[q0], (%[sq0]) \n\t" - "sw %[q1], (%[sq1]) \n\t" - "sw %[q2], (%[sq2]) \n\t" - - : - : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), - [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), - [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if ((flat != 0) && (mask != 0)) { - /* filtering */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, 
&q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], (%[sp2]) \n\t" - "sb %[p1_r], (%[sp1]) \n\t" - "sb %[p0_r], (%[sp0]) \n\t" - "sb %[q0_r], (%[sq0]) \n\t" - "sb %[q1_r], (%[sq1]) \n\t" - "sb %[q2_r], (%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], (%[sp1]) \n\t" - "sb %[p0_f0], (%[sp0]) \n\t" - "sb %[q0_f0], (%[sq0]) \n\t" - "sb %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], +1(%[sp2]) \n\t" - "sb %[p1_r], +1(%[sp1]) \n\t" - "sb %[p0_r], +1(%[sp0]) \n\t" - "sb %[q0_r], +1(%[sq0]) \n\t" - "sb %[q1_r], +1(%[sq1]) \n\t" - "sb %[q2_r], +1(%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], +1(%[sp1]) \n\t" - "sb %[p0_f0], +1(%[sp0]) \n\t" - "sb %[q0_f0], +1(%[sq0]) \n\t" - "sb %[q1_f0], +1(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), - [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), - [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], +2(%[sp2]) \n\t" - "sb %[p1_l], +2(%[sp1]) \n\t" - "sb %[p0_l], +2(%[sp0]) \n\t" - "sb %[q0_l], +2(%[sq0]) \n\t" - "sb %[q1_l], +2(%[sq1]) \n\t" - "sb %[q2_l], +2(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], +2(%[sp1]) \n\t" - "sb %[p0_f0], +2(%[sp0]) \n\t" - "sb %[q0_f0], +2(%[sq0]) \n\t" - "sb %[q1_f0], +2(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - 
} - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], +3(%[sp2]) \n\t" - "sb %[p1_l], +3(%[sp1]) \n\t" - "sb %[p0_l], +3(%[sp0]) \n\t" - "sb %[q0_l], +3(%[sq0]) \n\t" - "sb %[q1_l], +3(%[sq1]) \n\t" - "sb %[q2_l], +3(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], +3(%[sp1]) \n\t" - "sb %[p0_f0], +3(%[sp0]) \n\t" - "sb %[q0_f0], +3(%[sq0]) \n\t" - "sb %[q1_f0], +3(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - } - - s = s + 4; - } -} - -void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p3, p2, p1, p0, q3, q2, q1, q0; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - prefetch_store(s + pitch); - - for (i = 0; i < 2; i++) { - s1 = s; - s2 = s + pitch; - s3 = s2 + pitch; - s4 = s3 + pitch; - s = s4 + pitch; - - __asm__ __volatile__( - "lw %[p0], -4(%[s1]) \n\t" - "lw %[p1], -4(%[s2]) \n\t" - "lw %[p2], -4(%[s3]) \n\t" - "lw %[p3], -4(%[s4]) \n\t" - "lw %[q3], (%[s1]) \n\t" - "lw %[q2], (%[s2]) \n\t" - "lw %[q1], (%[s3]) \n\t" - "lw %[q0], (%[s4]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - /* transpose p3, p2, p1, p0 - original (when loaded from memory) - register -4 -3 -2 -1 - p0 p0_0 p0_1 p0_2 p0_3 - p1 p1_0 p1_1 p1_2 p1_3 - p2 p2_0 p2_1 p2_2 p2_3 - p3 p3_0 p3_1 p3_2 p3_3 - - after transpose - register - p0 p3_3 p2_3 p1_3 p0_3 - p1 p3_2 p2_2 p1_2 p0_2 - p2 p3_1 p2_1 p1_1 p0_1 - p3 p3_0 p2_0 p1_0 p0_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" - "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" - "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" - "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" - - "precrq.qb.ph %[p1], %[prim1], 
%[prim2] \n\t" - "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" - "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" - "append %[p1], %[sec3], 16 \n\t" - "append %[p3], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), - [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose q0, q1, q2, q3 - original (when loaded from memory) - register +1 +2 +3 +4 - q3 q3_0 q3_1 q3_2 q3_3 - q2 q2_0 q2_1 q2_2 q2_3 - q1 q1_0 q1_1 q1_2 q1_3 - q0 q0_0 q0_1 q0_2 q0_3 - - after transpose - register - q3 q0_3 q1_3 q2_3 q3_3 - q2 q0_2 q1_2 q2_2 q3_2 - q1 q0_1 q1_1 q2_1 q3_1 - q0 q0_0 q1_0 q2_0 q3_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" - "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" - "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" - "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" - - "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" - "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" - "append %[q2], %[sec3], 16 \n\t" - "append %[q0], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), - [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - STORE_F0() - } else if ((mask & flat) == 0xFFFFFFFF) { - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - STORE_F1() - } else if ((flat != 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s4]) \n\t" - "sb %[p1_r], -2(%[s4]) \n\t" - "sb %[p0_r], -1(%[s4]) \n\t" - "sb %[q0_r], (%[s4]) \n\t" - "sb %[q1_r], +1(%[s4]) \n\t" - "sb %[q2_r], +2(%[s4]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s4] "r"(s4)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s4]) \n\t" - "sb %[p0_f0], -1(%[s4]) \n\t" - "sb %[q0_f0], (%[s4]) \n\t" - "sb %[q1_f0], +1(%[s4]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s4] "r"(s4)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], 
%[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s3]) \n\t" - "sb %[p1_r], -2(%[s3]) \n\t" - "sb %[p0_r], -1(%[s3]) \n\t" - "sb %[q0_r], (%[s3]) \n\t" - "sb %[q1_r], +1(%[s3]) \n\t" - "sb %[q2_r], +2(%[s3]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s3] "r"(s3)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s3]) \n\t" - "sb %[p0_f0], -1(%[s3]) \n\t" - "sb %[q0_f0], (%[s3]) \n\t" - "sb %[q1_f0], +1(%[s3]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s3] "r"(s3)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), - [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), - [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s2] "r"(s2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s2]) \n\t" - "sb %[p0_f0], -1(%[s2]) \n\t" - "sb %[q0_f0], (%[s2]) \n\t" - "sb %[q1_f0], +1(%[s2]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s2] "r"(s2)); - } - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s1]) \n\t" - "sb %[p1_l], -2(%[s1]) \n\t" - "sb %[p0_l], -1(%[s1]) \n\t" - "sb %[q0_l], (%[s1]) \n\t" - "sb %[q1_l], +1(%[s1]) \n\t" - "sb %[q2_l], +2(%[s1]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s1] "r"(s1)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s1]) \n\t" - "sb %[p0_f0], -1(%[s1]) \n\t" - "sb %[q0_f0], (%[s1]) \n\t" - "sb %[q1_f0], +1(%[s1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s1] "r"(s1)); - } - } - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c deleted file mode 100644 index 34733e42e..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ /dev/null @@ -1,734 +0,0 @@ -/* - 
* Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int count) { - uint32_t mask; - uint32_t hev, flat, flat2; - uint8_t i; - uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; - uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s); - - for (i = 0; i < (2 * count); i++) { - sp7 = s - (pitch << 3); - sp6 = sp7 + pitch; - sp5 = sp6 + pitch; - sp4 = sp5 + pitch; - sp3 = sp4 + pitch; - sp2 = sp3 + pitch; - sp1 = sp2 + pitch; - sp0 = sp1 + pitch; - sq0 = s; - sq1 = s + pitch; - sq2 = sq1 + pitch; - sq3 = sq2 + pitch; - sq4 = sq3 + pitch; - sq5 = sq4 + pitch; - sq6 = sq5 + pitch; - sq7 = sq6 + pitch; - - __asm__ __volatile__( - "lw %[p7], (%[sp7]) \n\t" - "lw %[p6], (%[sp6]) \n\t" - "lw %[p5], (%[sp5]) \n\t" - "lw %[p4], (%[sp4]) \n\t" - "lw %[p3], (%[sp3]) \n\t" - "lw %[p2], (%[sp2]) \n\t" - "lw %[p1], (%[sp1]) \n\t" - "lw %[p0], (%[sp0]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) - : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); - - __asm__ __volatile__( - "lw %[q0], (%[sq0]) \n\t" - "lw %[q1], (%[sq1]) \n\t" - "lw %[q2], (%[sq2]) \n\t" - "lw %[q3], (%[sq3]) \n\t" - "lw %[q4], (%[sq4]) \n\t" - "lw %[q5], (%[sq5]) \n\t" - "lw %[q6], (%[sq6]) \n\t" - "lw %[q7], (%[sq7]) \n\t" - - : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), - [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) - : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), 
[sq0] "r"(sq0), - [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); - - /* f0 */ - if (((flat2 == 0) && (flat == 0) && (mask != 0)) || - ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - __asm__ __volatile__( - "sw %[p1_f0], (%[sp1]) \n\t" - "sw %[p0_f0], (%[sp0]) \n\t" - "sw %[q0_f0], (%[sq0]) \n\t" - "sw %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1)); - } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && - (mask == 0xFFFFFFFF)) { - /* f2 */ - PACK_LEFT_0TO3() - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_0TO3() - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, - &q6_r, &q7_r); - - COMBINE_LEFT_RIGHT_0TO2() - COMBINE_LEFT_RIGHT_3TO6() - - __asm__ __volatile__( - "sw %[p6], (%[sp6]) \n\t" - "sw %[p5], (%[sp5]) \n\t" - "sw %[p4], (%[sp4]) \n\t" - "sw %[p3], (%[sp3]) \n\t" - "sw %[p2], (%[sp2]) \n\t" - "sw %[p1], (%[sp1]) \n\t" - "sw %[p0], (%[sp0]) \n\t" - - : - : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), - [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), - [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sw %[q6], (%[sq6]) \n\t" - "sw %[q5], (%[sq5]) \n\t" - "sw %[q4], (%[sq4]) \n\t" - "sw %[q3], (%[sq3]) \n\t" - "sw %[q2], (%[sq2]) \n\t" - "sw %[q1], (%[sq1]) \n\t" - "sw %[q0], (%[sq0]) \n\t" - - : - : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), - [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), - [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), - [sq1] "r"(sq1), [sq0] "r"(sq0)); - } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { - /* f1 */ - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - COMBINE_LEFT_RIGHT_0TO2() - - __asm__ __volatile__( - "sw %[p2], (%[sp2]) \n\t" - "sw %[p1], (%[sp1]) \n\t" - "sw %[p0], (%[sp0]) \n\t" - "sw %[q0], (%[sq0]) \n\t" - "sw %[q1], (%[sq1]) \n\t" - "sw %[q2], (%[sq2]) \n\t" - - : - : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), - [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), - [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { - /* f0+f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], (%[sp2]) \n\t" - "sb %[p1_r], (%[sp1]) \n\t" - "sb %[p0_r], (%[sp0]) \n\t" - "sb %[q0_r], (%[sq0]) \n\t" - "sb %[q1_r], 
(%[sq1]) \n\t" - "sb %[q2_r], (%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], (%[sp1]) \n\t" - "sb %[p0_f0], (%[sp0]) \n\t" - "sb %[q0_f0], (%[sq0]) \n\t" - "sb %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], +1(%[sp2]) \n\t" - "sb %[p1_r], +1(%[sp1]) \n\t" - "sb %[p0_r], +1(%[sp0]) \n\t" - "sb %[q0_r], +1(%[sq0]) \n\t" - "sb %[q1_r], +1(%[sq1]) \n\t" - "sb %[q2_r], +1(%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], +1(%[sp1]) \n\t" - "sb %[p0_f0], +1(%[sp0]) \n\t" - "sb %[q0_f0], +1(%[sq0]) \n\t" - "sb %[q1_f0], +1(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], +2(%[sp2]) \n\t" - "sb %[p1_l], +2(%[sp1]) \n\t" - "sb %[p0_l], +2(%[sp0]) \n\t" - "sb %[q0_l], +2(%[sq0]) \n\t" - "sb %[q1_l], +2(%[sq1]) \n\t" - "sb %[q2_l], +2(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], +2(%[sp1]) \n\t" - "sb %[p0_f0], +2(%[sp0]) \n\t" - "sb %[q0_f0], +2(%[sq0]) \n\t" - "sb %[q1_f0], +2(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] 
"+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], +3(%[sp2]) \n\t" - "sb %[p1_l], +3(%[sp1]) \n\t" - "sb %[p0_l], +3(%[sp0]) \n\t" - "sb %[q0_l], +3(%[sq0]) \n\t" - "sb %[q1_l], +3(%[sq1]) \n\t" - "sb %[q2_l], +3(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], +3(%[sp1]) \n\t" - "sb %[p0_f0], +3(%[sp0]) \n\t" - "sb %[q0_f0], +3(%[sq0]) \n\t" - "sb %[q1_f0], +3(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { - /* f0 + f1 + f2 */ - /* f0 function */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* f1 function */ - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, - &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, - &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); - - /* f2 function */ - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, - &q6_r, &q7_r); - - if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__( - "sb %[p6_r], (%[sp6]) \n\t" - "sb %[p5_r], (%[sp5]) \n\t" - "sb %[p4_r], (%[sp4]) \n\t" - "sb %[p3_r], (%[sp3]) \n\t" - "sb %[p2_r], (%[sp2]) \n\t" - "sb %[p1_r], (%[sp1]) \n\t" - "sb %[p0_r], (%[sp0]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), - [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_r], (%[sq0]) \n\t" - "sb %[q1_r], (%[sq1]) \n\t" - "sb %[q2_r], (%[sq2]) \n\t" - "sb %[q3_r], (%[sq3]) \n\t" - "sb %[q4_r], (%[sq4]) \n\t" - "sb %[q5_r], (%[sq5]) \n\t" - "sb %[q6_r], (%[sq6]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), - [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); - } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r_f1], (%[sp2]) \n\t" - "sb %[p1_r_f1], (%[sp1]) \n\t" - "sb %[p0_r_f1], (%[sp0]) \n\t" - "sb %[q0_r_f1], (%[sq0]) \n\t" - "sb %[q1_r_f1], (%[sq1]) \n\t" - "sb %[q2_r_f1], (%[sq2]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], (%[sp1]) \n\t" - "sb %[p0_f0], (%[sp0]) \n\t" - "sb 
%[q0_f0], (%[sq0]) \n\t" - "sb %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p6_r], %[p6_r], 16 \n\t" - "srl %[p5_r], %[p5_r], 16 \n\t" - "srl %[p4_r], %[p4_r], 16 \n\t" - "srl %[p3_r], %[p3_r], 16 \n\t" - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[q3_r], %[q3_r], 16 \n\t" - "srl %[q4_r], %[q4_r], 16 \n\t" - "srl %[q5_r], %[q5_r], 16 \n\t" - "srl %[q6_r], %[q6_r], 16 \n\t" - - : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), - [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), - [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), - [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) - :); - - __asm__ __volatile__( - "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" - "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" - "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" - "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" - "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" - "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), - [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), - [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p6_r], +1(%[sp6]) \n\t" - "sb %[p5_r], +1(%[sp5]) \n\t" - "sb %[p4_r], +1(%[sp4]) \n\t" - "sb %[p3_r], +1(%[sp3]) \n\t" - "sb %[p2_r], +1(%[sp2]) \n\t" - "sb %[p1_r], +1(%[sp1]) \n\t" - "sb %[p0_r], +1(%[sp0]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), - [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_r], +1(%[sq0]) \n\t" - "sb %[q1_r], +1(%[sq1]) \n\t" - "sb %[q2_r], +1(%[sq2]) \n\t" - "sb %[q3_r], +1(%[sq3]) \n\t" - "sb %[q4_r], +1(%[sq4]) \n\t" - "sb %[q5_r], +1(%[sq5]) \n\t" - "sb %[q6_r], +1(%[sq6]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), - [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); - } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r_f1], +1(%[sp2]) \n\t" - "sb %[p1_r_f1], +1(%[sp1]) \n\t" - "sb %[p0_r_f1], +1(%[sp0]) \n\t" - "sb %[q0_r_f1], +1(%[sq0]) \n\t" - "sb %[q1_r_f1], +1(%[sq1]) \n\t" - "sb %[q2_r_f1], +1(%[sq2]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], +1(%[sp1]) \n\t" - "sb %[p0_f0], +1(%[sp0]) \n\t" - "sb %[q0_f0], +1(%[sq0]) \n\t" - "sb %[q1_f0], +1(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] 
"r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p6_l], +2(%[sp6]) \n\t" - "sb %[p5_l], +2(%[sp5]) \n\t" - "sb %[p4_l], +2(%[sp4]) \n\t" - "sb %[p3_l], +2(%[sp3]) \n\t" - "sb %[p2_l], +2(%[sp2]) \n\t" - "sb %[p1_l], +2(%[sp1]) \n\t" - "sb %[p0_l], +2(%[sp0]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), - [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_l], +2(%[sq0]) \n\t" - "sb %[q1_l], +2(%[sq1]) \n\t" - "sb %[q2_l], +2(%[sq2]) \n\t" - "sb %[q3_l], +2(%[sq3]) \n\t" - "sb %[q4_l], +2(%[sq4]) \n\t" - "sb %[q5_l], +2(%[sq5]) \n\t" - "sb %[q6_l], +2(%[sq6]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), - [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); - } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l_f1], +2(%[sp2]) \n\t" - "sb %[p1_l_f1], +2(%[sp1]) \n\t" - "sb %[p0_l_f1], +2(%[sp0]) \n\t" - "sb %[q0_l_f1], +2(%[sq0]) \n\t" - "sb %[q1_l_f1], +2(%[sq1]) \n\t" - "sb %[q2_l_f1], +2(%[sq2]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], +2(%[sp1]) \n\t" - "sb %[p0_f0], +2(%[sp0]) \n\t" - "sb %[q0_f0], +2(%[sq0]) \n\t" - "sb %[q1_f0], +2(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p6_l], %[p6_l], 16 \n\t" - "srl %[p5_l], %[p5_l], 16 \n\t" - "srl %[p4_l], %[p4_l], 16 \n\t" - "srl %[p3_l], %[p3_l], 16 \n\t" - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[q3_l], %[q3_l], 16 \n\t" - "srl %[q4_l], %[q4_l], 16 \n\t" - "srl %[q5_l], %[q5_l], 16 \n\t" - "srl %[q6_l], %[q6_l], 16 \n\t" - - : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), - [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), - [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), - [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) - :); - - __asm__ __volatile__( - "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" - "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" - "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" - "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" - "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" - "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), - [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), - 
[q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__( - "sb %[p6_l], +3(%[sp6]) \n\t" - "sb %[p5_l], +3(%[sp5]) \n\t" - "sb %[p4_l], +3(%[sp4]) \n\t" - "sb %[p3_l], +3(%[sp3]) \n\t" - "sb %[p2_l], +3(%[sp2]) \n\t" - "sb %[p1_l], +3(%[sp1]) \n\t" - "sb %[p0_l], +3(%[sp0]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), - [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_l], +3(%[sq0]) \n\t" - "sb %[q1_l], +3(%[sq1]) \n\t" - "sb %[q2_l], +3(%[sq2]) \n\t" - "sb %[q3_l], +3(%[sq3]) \n\t" - "sb %[q4_l], +3(%[sq4]) \n\t" - "sb %[q5_l], +3(%[sq5]) \n\t" - "sb %[q6_l], +3(%[sq6]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), - [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); - } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l_f1], +3(%[sp2]) \n\t" - "sb %[p1_l_f1], +3(%[sp1]) \n\t" - "sb %[p0_l_f1], +3(%[sp0]) \n\t" - "sb %[q0_l_f1], +3(%[sq0]) \n\t" - "sb %[q1_l_f1], +3(%[sq1]) \n\t" - "sb %[q2_l_f1], +3(%[sq2]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], +3(%[sp1]) \n\t" - "sb %[p0_f0], +3(%[sp0]) \n\t" - "sb %[q0_f0], +3(%[sq0]) \n\t" - "sb %[q1_f0], +3(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - } - - s = s + 4; - } -} - -void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); -} - -void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c deleted file mode 100644 index 3d3f1ec97..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c +++ /dev/null @@ -1,758 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat, flat2; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - prefetch_store(s + pitch); - - for (i = 0; i < 2; i++) { - s1 = s; - s2 = s + pitch; - s3 = s2 + pitch; - s4 = s3 + pitch; - s = s4 + pitch; - - __asm__ __volatile__( - "lw %[p0], -4(%[s1]) \n\t" - "lw %[p1], -4(%[s2]) \n\t" - "lw %[p2], -4(%[s3]) \n\t" - "lw %[p3], -4(%[s4]) \n\t" - "lw %[p4], -8(%[s1]) \n\t" - "lw %[p5], -8(%[s2]) \n\t" - "lw %[p6], -8(%[s3]) \n\t" - "lw %[p7], -8(%[s4]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - __asm__ __volatile__( - "lw %[q3], (%[s1]) \n\t" - "lw %[q2], (%[s2]) \n\t" - "lw %[q1], (%[s3]) \n\t" - "lw %[q0], (%[s4]) \n\t" - "lw %[q7], +4(%[s1]) \n\t" - "lw %[q6], +4(%[s2]) \n\t" - "lw %[q5], +4(%[s3]) \n\t" - "lw %[q4], +4(%[s4]) \n\t" - - : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), - [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - /* transpose p3, p2, p1, p0 - original (when loaded from memory) - register -4 -3 -2 -1 - p0 p0_0 p0_1 p0_2 p0_3 - p1 p1_0 p1_1 p1_2 p1_3 - p2 p2_0 p2_1 p2_2 p2_3 - p3 p3_0 p3_1 p3_2 p3_3 - - after transpose - register - p0 p3_3 p2_3 p1_3 p0_3 - p1 p3_2 p2_2 p1_2 p0_2 - p2 p3_1 p2_1 p1_1 p0_1 - p3 p3_0 p2_0 p1_0 p0_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" - "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" - "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" - "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" - - "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" - "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" - "append %[p1], %[sec3], 16 \n\t" - "append %[p3], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), 
[prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), - [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose q0, q1, q2, q3 - original (when loaded from memory) - register +1 +2 +3 +4 - q3 q3_0 q3_1 q3_2 q3_3 - q2 q2_0 q2_1 q2_2 q2_3 - q1 q1_0 q1_1 q1_2 q1_3 - q0 q0_0 q0_1 q0_2 q0_3 - - after transpose - register - q3 q0_3 q1_3 q2_3 q3_3 - q2 q0_2 q1_2 q2_2 q3_2 - q1 q0_1 q1_1 q2_1 q3_1 - q0 q0_0 q1_0 q2_0 q3_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" - "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" - "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" - "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" - - "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" - "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" - "append %[q2], %[sec3], 16 \n\t" - "append %[q0], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), - [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose p7, p6, p5, p4 - original (when loaded from memory) - register -8 -7 -6 -5 - p4 p4_0 p4_1 p4_2 p4_3 - p5 p5_0 p5_1 p5_2 p5_3 - p6 p6_0 p6_1 p6_2 p6_3 - p7 p7_0 p7_1 p7_2 p7_3 - - after transpose - register - p4 p7_3 p6_3 p5_3 p4_3 - p5 p7_2 p6_2 p5_2 p4_2 - p6 p7_1 p6_1 p5_1 p4_1 - p7 p7_0 p6_0 p5_0 p4_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" - "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" - "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" - "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" - - "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" - "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" - "append %[p5], %[sec3], 16 \n\t" - "append %[p7], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6), - [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose q4, q5, q6, q7 - original (when loaded from memory) - register +5 +6 +7 +8 - q7 q7_0 q7_1 q7_2 q7_3 - q6 q6_0 q6_1 q6_2 q6_3 - q5 q5_0 q5_1 q5_2 q5_3 - q4 q4_0 q4_1 q4_2 q4_3 - - after transpose - register - q7 q4_3 q5_3 q26_3 q7_3 - q6 q4_2 q5_2 q26_2 q7_2 - q5 q4_1 q5_1 q26_1 q7_1 - q4 q4_0 q5_0 q26_0 q7_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" - "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" - "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" - "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" - - "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" - "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" - "append %[q6], %[sec3], 16 \n\t" - "append %[q4], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), - [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - 
flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); - - /* f0 */ - if (((flat2 == 0) && (flat == 0) && (mask != 0)) || - ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - STORE_F0() - } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && - (mask == 0xFFFFFFFF)) { - /* f2 */ - PACK_LEFT_0TO3() - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_0TO3() - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, - &q6_r, &q7_r); - - STORE_F2() - } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { - /* f1 */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - STORE_F1() - } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { - /* f0 + f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s4]) \n\t" - "sb %[p1_r], -2(%[s4]) \n\t" - "sb %[p0_r], -1(%[s4]) \n\t" - "sb %[q0_r], (%[s4]) \n\t" - "sb %[q1_r], +1(%[s4]) \n\t" - "sb %[q2_r], +2(%[s4]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s4] "r"(s4)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s4]) \n\t" - "sb %[p0_f0], -1(%[s4]) \n\t" - "sb %[q0_f0], (%[s4]) \n\t" - "sb %[q1_f0], +1(%[s4]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s4] "r"(s4)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s3]) \n\t" - "sb %[p1_r], -2(%[s3]) \n\t" - "sb %[p0_r], -1(%[s3]) \n\t" - "sb %[q0_r], (%[s3]) \n\t" - "sb %[q1_r], +1(%[s3]) \n\t" - "sb %[q2_r], +2(%[s3]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s3] "r"(s3)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s3]) \n\t" - "sb %[p0_f0], -1(%[s3]) \n\t" - "sb %[q0_f0], (%[s3]) \n\t" - "sb %[q1_f0], +1(%[s3]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s3] "r"(s3)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] 
"+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s2] "r"(s2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s2]) \n\t" - "sb %[p0_f0], -1(%[s2]) \n\t" - "sb %[q0_f0], (%[s2]) \n\t" - "sb %[q1_f0], +1(%[s2]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s2] "r"(s2)); - } - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s1]) \n\t" - "sb %[p1_l], -2(%[s1]) \n\t" - "sb %[p0_l], -1(%[s1]) \n\t" - "sb %[q0_l], (%[s1]) \n\t" - "sb %[q1_l], +1(%[s1]) \n\t" - "sb %[q2_l], +2(%[s1]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s1] "r"(s1)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s1]) \n\t" - "sb %[p0_f0], -1(%[s1]) \n\t" - "sb %[q0_f0], (%[s1]) \n\t" - "sb %[q1_f0], +1(%[s1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s1] "r"(s1)); - } - } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { - /* f0+f1+f2 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, - &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); - - PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, - &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); - - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, - &q6_r, &q7_r); - - if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__( - "sb %[p6_r], -7(%[s4]) \n\t" - "sb %[p5_r], -6(%[s4]) \n\t" - "sb %[p4_r], -5(%[s4]) \n\t" - "sb %[p3_r], -4(%[s4]) \n\t" - "sb %[p2_r], -3(%[s4]) \n\t" - "sb %[p1_r], -2(%[s4]) \n\t" - "sb %[p0_r], -1(%[s4]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [p0_r] "r"(p0_r), [s4] "r"(s4)); - - __asm__ __volatile__( - "sb %[q0_r], (%[s4]) \n\t" - "sb %[q1_r], +1(%[s4]) \n\t" - "sb %[q2_r], +2(%[s4]) \n\t" - "sb %[q3_r], +3(%[s4]) \n\t" - "sb %[q4_r], +4(%[s4]) \n\t" - "sb %[q5_r], +5(%[s4]) \n\t" - "sb %[q6_r], +6(%[s4]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] 
"r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [s4] "r"(s4)); - } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r_f1], -3(%[s4]) \n\t" - "sb %[p1_r_f1], -2(%[s4]) \n\t" - "sb %[p0_r_f1], -1(%[s4]) \n\t" - "sb %[q0_r_f1], (%[s4]) \n\t" - "sb %[q1_r_f1], +1(%[s4]) \n\t" - "sb %[q2_r_f1], +2(%[s4]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s4]) \n\t" - "sb %[p0_f0], -1(%[s4]) \n\t" - "sb %[q0_f0], (%[s4]) \n\t" - "sb %[q1_f0], +1(%[s4]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s4] "r"(s4)); - } - - __asm__ __volatile__( - "srl %[p6_r], %[p6_r], 16 \n\t" - "srl %[p5_r], %[p5_r], 16 \n\t" - "srl %[p4_r], %[p4_r], 16 \n\t" - "srl %[p3_r], %[p3_r], 16 \n\t" - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[q3_r], %[q3_r], 16 \n\t" - "srl %[q4_r], %[q4_r], 16 \n\t" - "srl %[q5_r], %[q5_r], 16 \n\t" - "srl %[q6_r], %[q6_r], 16 \n\t" - - : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), - [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), - [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), - [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) - :); - - __asm__ __volatile__( - "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" - "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" - "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" - "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" - "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" - "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), - [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), - [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p6_r], -7(%[s3]) \n\t" - "sb %[p5_r], -6(%[s3]) \n\t" - "sb %[p4_r], -5(%[s3]) \n\t" - "sb %[p3_r], -4(%[s3]) \n\t" - "sb %[p2_r], -3(%[s3]) \n\t" - "sb %[p1_r], -2(%[s3]) \n\t" - "sb %[p0_r], -1(%[s3]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [p0_r] "r"(p0_r), [s3] "r"(s3)); - - __asm__ __volatile__( - "sb %[q0_r], (%[s3]) \n\t" - "sb %[q1_r], +1(%[s3]) \n\t" - "sb %[q2_r], +2(%[s3]) \n\t" - "sb %[q3_r], +3(%[s3]) \n\t" - "sb %[q4_r], +4(%[s3]) \n\t" - "sb %[q5_r], +5(%[s3]) \n\t" - "sb %[q6_r], +6(%[s3]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [s3] "r"(s3)); - } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r_f1], -3(%[s3]) \n\t" - "sb %[p1_r_f1], -2(%[s3]) \n\t" - "sb %[p0_r_f1], -1(%[s3]) \n\t" - "sb %[q0_r_f1], (%[s3]) \n\t" - "sb %[q1_r_f1], +1(%[s3]) \n\t" - "sb %[q2_r_f1], +2(%[s3]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); - } else if 
(mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s3]) \n\t" - "sb %[p0_f0], -1(%[s3]) \n\t" - "sb %[q0_f0], (%[s3]) \n\t" - "sb %[q1_f0], +1(%[s3]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s3] "r"(s3)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p6_l], -7(%[s2]) \n\t" - "sb %[p5_l], -6(%[s2]) \n\t" - "sb %[p4_l], -5(%[s2]) \n\t" - "sb %[p3_l], -4(%[s2]) \n\t" - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), [s2] "r"(s2)); - - __asm__ __volatile__( - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - "sb %[q3_l], +3(%[s2]) \n\t" - "sb %[q4_l], +4(%[s2]) \n\t" - "sb %[q5_l], +5(%[s2]) \n\t" - "sb %[q6_l], +6(%[s2]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [q6_l] "r"(q6_l), [s2] "r"(s2)); - } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l_f1], -3(%[s2]) \n\t" - "sb %[p1_l_f1], -2(%[s2]) \n\t" - "sb %[p0_l_f1], -1(%[s2]) \n\t" - "sb %[q0_l_f1], (%[s2]) \n\t" - "sb %[q1_l_f1], +1(%[s2]) \n\t" - "sb %[q2_l_f1], +2(%[s2]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s2]) \n\t" - "sb %[p0_f0], -1(%[s2]) \n\t" - "sb %[q0_f0], (%[s2]) \n\t" - "sb %[q1_f0], +1(%[s2]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s2] "r"(s2)); - } - - __asm__ __volatile__( - "srl %[p6_l], %[p6_l], 16 \n\t" - "srl %[p5_l], %[p5_l], 16 \n\t" - "srl %[p4_l], %[p4_l], 16 \n\t" - "srl %[p3_l], %[p3_l], 16 \n\t" - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[q3_l], %[q3_l], 16 \n\t" - "srl %[q4_l], %[q4_l], 16 \n\t" - "srl %[q5_l], %[q5_l], 16 \n\t" - "srl %[q6_l], %[q6_l], 16 \n\t" - - : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), - [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), - [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), - [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) - :); - - __asm__ __volatile__( - "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" - "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" - "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" - "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" - "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" - "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), - [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), - [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] 
"+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__( - "sb %[p6_l], -7(%[s1]) \n\t" - "sb %[p5_l], -6(%[s1]) \n\t" - "sb %[p4_l], -5(%[s1]) \n\t" - "sb %[p3_l], -4(%[s1]) \n\t" - "sb %[p2_l], -3(%[s1]) \n\t" - "sb %[p1_l], -2(%[s1]) \n\t" - "sb %[p0_l], -1(%[s1]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), [s1] "r"(s1)); - - __asm__ __volatile__( - "sb %[q0_l], (%[s1]) \n\t" - "sb %[q1_l], 1(%[s1]) \n\t" - "sb %[q2_l], 2(%[s1]) \n\t" - "sb %[q3_l], 3(%[s1]) \n\t" - "sb %[q4_l], 4(%[s1]) \n\t" - "sb %[q5_l], 5(%[s1]) \n\t" - "sb %[q6_l], 6(%[s1]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [q6_l] "r"(q6_l), [s1] "r"(s1)); - } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l_f1], -3(%[s1]) \n\t" - "sb %[p1_l_f1], -2(%[s1]) \n\t" - "sb %[p0_l_f1], -1(%[s1]) \n\t" - "sb %[q0_l_f1], (%[s1]) \n\t" - "sb %[q1_l_f1], +1(%[s1]) \n\t" - "sb %[q2_l_f1], +2(%[s1]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s1]) \n\t" - "sb %[p0_f0], -1(%[s1]) \n\t" - "sb %[q0_f0], (%[s1]) \n\t" - "sb %[q1_f0], +1(%[s1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s1] "r"(s1)); - } - } - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h deleted file mode 100644 index 54b0bb4bd..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_msa.h +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" - -#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - filt = filt & (v16i8)hev_in; \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - /* combine left and right part */ \ - filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ - \ - filt = filt & (v16i8)mask_in; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ - } - -#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, 
filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ - } - -#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ - v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 zero_in = { 0 }; \ - \ - tmp_flat4 = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp_flat4 < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ - } - -#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ - q6_in, q7_in, flat_in, flat2_out) \ - { \ - v16u8 tmp_flat5, zero_in = { 0 }; \ - v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ - v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ - \ - tmp_flat5 = __msa_ori_b(zero_in, 1); \ - p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ - q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ - p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ - q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ - p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ - q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ - p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ - q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ - \ - p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ - flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ - flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ - p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ - flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ - p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ - flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ - \ - flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ - flat2_out = __msa_xori_b(flat2_out, 0xff); \ - flat2_out = flat2_out & flat_in; \ - } - -#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ - p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ - q1_filt8_out, q2_filt8_out) \ - { \ - v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ - \ - tmp_filt8_2 = p2_in + p1_in + p0_in; \ - tmp_filt8_0 = p3_in << 1; \ - \ - tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ - tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_1 = q2_in + q1_in + q0_in; \ - tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ - tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ - tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ - \ - tmp_filt8_0 = q2_in + q3_in; \ - tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ - tmp_filt8_1 = q3_in + q3_in; \ - tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ - q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_0 = tmp_filt8_2 + q3_in; \ - tmp_filt8_1 = tmp_filt8_0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_1 = tmp_filt8_0 - p2_in; \ - tmp_filt8_0 = q1_in + q3_in; \ - tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ - q1_filt8_out = 
(v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - } - -#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ - limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ - flat_out) \ - { \ - v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ - v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ - \ - /* absolute subtraction of pixel values */ \ - p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ - p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ - p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ - q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ - q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ - q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ - p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ - p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ - \ - /* calculation of hev */ \ - flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ - hev_out = thresh_in < (v16u8)flat_out; \ - \ - /* calculation of mask */ \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ - p1_asub_q1_m >>= 1; \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ - \ - mask_out = b_limit_in < p0_asub_q0_m; \ - mask_out = __msa_max_u_b(flat_out, mask_out); \ - p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ - mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ - q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ - mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ - \ - mask_out = limit_in < (v16u8)mask_out; \ - mask_out = __msa_xori_b(mask_out, 0xff); \ - } -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h deleted file mode 100644 index 9bfc27147..000000000 --- a/third_party/aom/aom_dsp/mips/macros_msa.h +++ /dev/null @@ -1,2058 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_ -#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_ - -#include <msa.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) - -#if (__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) -#else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ - }) -#endif // (__mips == 64) - -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } -#else // !(__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) -#else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m_combined = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m_combined = (uint64_t)(val1_m); \ - val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ - val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ - \ - val_m_combined; \ - }) -#endif // (__mips == 64) - -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" 
\ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ - } -#endif // (__mips_isa_rev >= 6) - -/* Description : Load 4 words with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1, out2, out3 - Details : Load word in 'out0' from (psrc) - Load word in 'out1' from (psrc + stride) - Load word in 'out2' from (psrc + 2 * stride) - Load word in 'out3' from (psrc + 3 * stride) -*/ -#define LW4(psrc, stride, out0, out1, out2, out3) \ - { \ - out0 = LW((psrc)); \ - out1 = LW((psrc) + stride); \ - out2 = LW((psrc) + 2 * stride); \ - out3 = LW((psrc) + 3 * stride); \ - } - -/* Description : Load double words with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load double word in 'out0' from (psrc) - Load double word in 'out1' from (psrc + stride) -*/ -#define LD2(psrc, stride, out0, out1) \ - { \ - out0 = LD((psrc)); \ - out1 = LD((psrc) + stride); \ - } -#define LD4(psrc, stride, out0, out1, out2, out3) \ - { \ - LD2((psrc), stride, out0, out1); \ - LD2((psrc) + 2 * stride, stride, out2, out3); \ - } - -/* Description : Store 4 words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Store word from 'in0' to (pdst) - Store word from 'in1' to (pdst + stride) - Store word from 'in2' to (pdst + 2 * stride) - Store word from 'in3' to (pdst + 3 * stride) -*/ -#define SW4(in0, in1, in2, in3, pdst, stride) \ - { \ - SW(in0, (pdst)) \ - SW(in1, (pdst) + stride); \ - SW(in2, (pdst) + 2 * stride); \ - SW(in3, (pdst) + 3 * stride); \ - } - -/* Description : Store 4 double words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Store double word from 'in0' to (pdst) - Store double word from 'in1' to (pdst + stride) - Store double word from 'in2' to (pdst + 2 * stride) - Store double word from 'in3' to (pdst + 3 * stride) -*/ -#define SD4(in0, in1, in2, in3, pdst, stride) \ - { \ - SD(in0, (pdst)) \ - SD(in1, (pdst) + stride); \ - SD(in2, (pdst) + 2 * stride); \ - SD(in3, (pdst) + 3 * stride); \ - } - -/* Description : Load vectors with 16 byte elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Load 16 byte elements in 'out0' from (psrc) - Load 16 byte elements in 'out1' from (psrc + stride) -*/ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ - } -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) - -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ - { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ - } -#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) - -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) - -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ - { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ - } -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) 
LD_B5(v16i8, __VA_ARGS__) - -#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ - { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ - } -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) - -#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) - -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ - } -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) - -#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7, out8, out9, out10, out11, out12, out13, out14, out15) \ - { \ - LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ - out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ - out13, out14, out15); \ - } -#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) - -/* Description : Load 4x4 block of signed halfword elements from 1D source - data into 4 vectors (Each vector with 4 signed halfwords) - Arguments : Input - psrc - Outputs - out0, out1, out2, out3 -*/ -#define LD4x4_SH(psrc, out0, out1, out2, out3) \ - { \ - out0 = LD_SH(psrc); \ - out2 = LD_SH(psrc + 8); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - } - -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) \ - { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ - } - -/* Description : Store vectors of 16 byte elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 16 byte elements from 'in0' to (pdst) - Store 16 byte elements from 'in1' to (pdst + stride) -*/ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) - -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) - -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) \ - { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ - } - -/* Description : Store 2x4 byte block to destination memory from input vector - Arguments : Inputs - in, stidx, pdst, stride - Details : Index 'stidx' halfword element from 'in' vector is copied to - the GP register and stored to (pdst) - Index 'stidx+1' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + stride) - Index 'stidx+2' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + 2 * stride) - Index 'stidx+3' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + 3 * stride) -*/ -#define ST2x4_UB(in, stidx, pdst, stride) \ - { \ - uint16_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ - out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ - out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ - out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ - \ - SH(out0_m, pblk_2x4_m); \ - SH(out1_m, pblk_2x4_m + stride); \ - SH(out2_m, pblk_2x4_m + 2 * stride); \ - SH(out3_m, pblk_2x4_m + 3 * stride); \ - } - -/* Description : Store 4x2 byte block to destination memory from input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 word element from 'in' vector is copied to the GP - register and stored to (pdst) - Index 1 word element from 'in' vector is copied to the GP - register and stored to (pdst + stride) -*/ -#define ST4x2_UB(in, pdst, stride) \ - { \ - uint32_t out0_m, out1_m; \ - uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in, 0); \ - out1_m = __msa_copy_u_w((v4i32)in, 1); \ - \ - SW(out0_m, pblk_4x2_m); \ - SW(out1_m, pblk_4x2_m + stride); \ - } - -/* Description : Store 4x4 byte block to destination memory from input vector - Arguments : Inputs - in0, in1, pdst, stride - Details : 'Idx0' word element from input vector 'in0' is copied to the - GP register and stored to (pdst) - 'Idx1' word element from input vector 'in0' is copied to the - GP register and stored to (pdst + stride) - 'Idx2' word element from input vector 'in0' is copied to the - GP register and stored to (pdst + 2 * stride) - 'Idx3' word element from input vector 'in0' is copied to the - GP register and stored to (pdst + 3 * stride) -*/ -#define ST4x4_UB(in0, in1, idx0, 
idx1, idx2, idx3, pdst, stride) \ - { \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ - out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ - out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ - out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ - \ - SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ - } -#define ST4x8_UB(in0, in1, pdst, stride) \ - { \ - uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ - \ - ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ - ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ - } - -/* Description : Store 8x1 byte block to destination memory from input vector - Arguments : Inputs - in, pdst - Details : Index 0 double word element from 'in' vector is copied to the - GP register and stored to (pdst) -*/ -#define ST8x1_UB(in, pdst) \ - { \ - uint64_t out0_m; \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - SD(out0_m, pdst); \ - } - -/* Description : Store 8x2 byte block to destination memory from input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 double word element from 'in' vector is copied to the - GP register and stored to (pdst) - Index 1 double word element from 'in' vector is copied to the - GP register and stored to (pdst + stride) -*/ -#define ST8x2_UB(in, pdst, stride) \ - { \ - uint64_t out0_m, out1_m; \ - uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - out1_m = __msa_copy_u_d((v2i64)in, 1); \ - \ - SD(out0_m, pblk_8x2_m); \ - SD(out1_m, pblk_8x2_m + stride); \ - } - -/* Description : Store 8x4 byte block to destination memory from input - vectors - Arguments : Inputs - in0, in1, pdst, stride - Details : Index 0 double word element from 'in0' vector is copied to the - GP register and stored to (pdst) - Index 1 double word element from 'in0' vector is copied to the - GP register and stored to (pdst + stride) - Index 0 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 2 * stride) - Index 1 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 3 * stride) -*/ -#define ST8x4_UB(in0, in1, pdst, stride) \ - { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in0, 0); \ - out1_m = __msa_copy_u_d((v2i64)in0, 1); \ - out2_m = __msa_copy_u_d((v2i64)in1, 0); \ - out3_m = __msa_copy_u_d((v2i64)in1, 1); \ - \ - SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ - } - -/* Description : average with rounding (in0 + in1 + 1) / 2. - Arguments : Inputs - in0, in1, in2, in3, - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned byte element from 'in0' vector is added with - each unsigned byte element from 'in1' vector. Then the average - with rounding is calculated and written to 'out0' -*/ -#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ - out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ - } -#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) - -#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ - } -#define AVER_UB4_UB(...) 
AVER_UB4(v16u8, __VA_ARGS__) - -/* Description : Immediate number of elements to slide with zero - Arguments : Inputs - in0, in1, slide_val - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'zero_m' vector are slid into 'in0' by - value specified in the 'slide_val' -*/ -#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ - { \ - v16i8 zero_m = { 0 }; \ - out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ - } -#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) - -#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ - slide_val) \ - { \ - SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ - SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ - } -#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) - -/* Description : Immediate number of elements to slide - Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by - value specified in the 'slide_val' -*/ -#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ - { \ - out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ - } -#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) -#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) - -#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ - out2, slide_val) \ - { \ - SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ - out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ - } -#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) -#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) - -/* Description : Shuffle byte vector elements as per mask vector - Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0' & 'in1' are copied selectively to - 'out0' as per control vector 'mask0' -*/ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ - } -#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) -#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) -#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) - -#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ - out3) \ - { \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ - } -#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) -#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) - -/* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Unsigned byte elements from 'mult0' are multiplied with - unsigned byte elements from 'cnst0' producing a result - twice the size of input i.e. unsigned halfword. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ - out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ - } -#define DOTP_UB2_UH(...) 
DOTP_UB2(v8u16, __VA_ARGS__) - -#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) - -/* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed byte elements from 'mult0' are multiplied with - signed byte elements from 'cnst0' producing a result - twice the size of input i.e. signed halfword. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ - } -#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) - -#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) - -/* Description : Dot product of halfword vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'mult0' are multiplied with - signed halfword elements from 'cnst0' producing a result - twice the size of input i.e. signed word. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ - } -#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) - -#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) - -/* Description : Dot product of word vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed word elements from 'mult0' are multiplied with - signed word elements from 'cnst0' producing a result - twice the size of input i.e. signed double word. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ - } -#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) - -/* Description : Dot product & addition of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed byte elements from 'mult0' are multiplied with - signed byte elements from 'cnst0' producing a result - twice the size of input i.e. signed halfword. 
- The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ - } -#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) - -#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) - -/* Description : Dot product & addition of halfword vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'mult0' are multiplied with - signed halfword elements from 'cnst0' producing a result - twice the size of input i.e. signed word. - The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ - } -#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) - -/* Description : Dot product & addition of double word vector elements - Arguments : Inputs - mult0, mult1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each signed word element from 'mult0' is multiplied with itself - producing an intermediate result twice the size of input - i.e. signed double word - The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ - out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ - } -#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) - -/* Description : Minimum values between unsigned elements of - either vector are copied to the output vector - Arguments : Inputs - in0, in1, min_vec - Outputs - in place operation - Return Type - as per RTYPE - Details : Minimum of unsigned halfword element values from 'in0' and - 'min_vec' are written to output vector 'in0' -*/ -#define MIN_UH2(RTYPE, in0, in1, min_vec) \ - { \ - in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ - in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ - } -#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) - -#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ - { \ - MIN_UH2(RTYPE, in0, in1, min_vec); \ - MIN_UH2(RTYPE, in2, in3, min_vec); \ - } -#define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) - -/* Description : Clips all signed halfword elements of input vector - between 0 & 255 - Arguments : Input - in - Output - out_m - Return Type - signed halfword -*/ -#define CLIP_SH_0_255(in) \ - ({ \ - v8i16 max_m = __msa_ldi_h(255); \ - v8i16 out_m; \ - \ - out_m = __msa_maxi_s_h((v8i16)in, 0); \ - out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ - out_m; \ - }) -#define CLIP_SH2_0_255(in0, in1) \ - { \ - in0 = CLIP_SH_0_255(in0); \ - in1 = CLIP_SH_0_255(in1); \ - } -#define CLIP_SH4_0_255(in0, in1, in2, in3) \ - { \ - CLIP_SH2_0_255(in0, in1); \ - CLIP_SH2_0_255(in2, in3); \ - } - -/* Description : Horizontal addition of 4 signed word elements of input vector - Arguments : Input - in (signed word vector) - Output - sum_m (i32 sum) - Return Type - signed word (GP) - Details : 4 signed word elements of 'in' vector are added together and - the resulting integer sum is returned -*/ -#define HADD_SW_S32(in) \ - ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ - }) - -/* Description : Horizontal addition of 8 unsigned halfword elements - Arguments : Inputs - in (unsigned halfword vector) - Outputs - sum_m (u32 sum) - Return Type - unsigned word - Details : 8 unsigned halfword elements of input vector are added - together and the resulting integer sum is returned -*/ -#define HADD_UH_U32(in) \ - ({ \ - v4u32 res_m; \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - res0_m = __msa_hadd_u_d(res_m, res_m); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ - }) - -/* Description : Horizontal addition of unsigned byte vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned odd byte element from 'in0' is added to - even unsigned byte element from 'in0' (pairwise) and the - halfword result is written to 'out0' -*/ -#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ - } -#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) - -#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - HADD_UB2(RTYPE, in0, in1, out0, out1); \ - HADD_UB2(RTYPE, in2, in3, out2, out3); \ - } -#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) - -/* Description : Horizontal subtraction of unsigned byte vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned odd byte element from 'in0' is subtracted from - even unsigned byte element from 'in0' (pairwise) and the - halfword result is written to 'out0' -*/ -#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ - } -#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) - -/* Description : SAD (Sum of Absolute Difference) - Arguments : Inputs - in0, in1, ref0, ref1 - Outputs - sad_m (halfword vector) - Return Type - unsigned halfword - Details : Absolute difference of all the byte elements from 'in0' with - 'ref0' is calculated and preserved in 'diff0'. Then even-odd - pairs are added together to generate 8 halfword results. 
-*/ -#define SAD_UB2_UH(in0, in1, ref0, ref1) \ - ({ \ - v16u8 diff0_m, diff1_m; \ - v8u16 sad_m = { 0 }; \ - \ - diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ - diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ - \ - sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ - sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ - \ - sad_m; \ - }) - -/* Description : Horizontal subtraction of signed halfword vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each signed odd halfword element from 'in0' is subtracted from - even signed halfword element from 'in0' (pairwise) and the - word result is written to 'out0' -*/ -#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ - out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ - } -#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) - -/* Description : Set element n input vector to GPR value - Arguments : Inputs - in0, in1, in2, in3 - Output - out - Return Type - as per RTYPE - Details : Set element 0 in vector 'out' to value specified in 'in0' -*/ -#define INSERT_W2(RTYPE, in0, in1, out) \ - { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ - } -#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) - -#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ - { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ - } -#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) -#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) - -#define INSERT_D2(RTYPE, in0, in1, out) \ - { \ - out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ - out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ - } -#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) -#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) - -/* Description : Interleave even byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even byte elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ - } -#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) -#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) - -/* Description : Interleave even halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even halfword elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ - out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ - } -#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) -#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) -#define ILVEV_H2_SW(...) 
ILVEV_H2(v4i32, __VA_ARGS__) - -/* Description : Interleave even word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even word elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ - out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ - } -#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) - -/* Description : Interleave even double word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even double word elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ - out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ - } -#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) - -/* Description : Interleave left half of byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of byte elements of 'in0' and 'in1' are interleaved - and written to 'out0'. -*/ -#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ - } -#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) -#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) -#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) -#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) - -#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) -#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) -#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) - -/* Description : Interleave left half of halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of halfword elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ - } -#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) -#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) - -/* Description : Interleave left half of word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of word elements of 'in0' and 'in1' are interleaved - and written to 'out0'. -*/ -#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ - } -#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) -#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) - -/* Description : Interleave right half of byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements of 'in0' and 'in1' are interleaved - and written to out0. 
-*/ -#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ - } -#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) -#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) -#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) -#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) - -#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) -#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) -#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) -#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) - -#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ - in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ - out5, out6, out7) \ - { \ - ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3); \ - ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ - out6, out7); \ - } -#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) - -/* Description : Interleave right half of halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of halfword elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ - } -#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) -#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) - -#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) - -#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ - } -#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) -#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) - -#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) - -/* Description : Interleave right half of double word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of double word elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ - out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ - } -#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) -#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) -#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) - -#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ - { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ - } -#define ILVR_D3_SB(...) 
ILVR_D3(v16i8, __VA_ARGS__) - -#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) -#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) - -/* Description : Interleave both left and right half of input vectors - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements from 'in0' and 'in1' are - interleaved and written to 'out0' -*/ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - } -#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) -#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) -#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) -#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) - -#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ - } -#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) -#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) - -#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ - } -#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) -#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) -#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) - -/* Description : Saturate the halfword element values to the max - unsigned value of (sat_val + 1) bits - The element data width remains unchanged - Arguments : Inputs - in0, in1, sat_val - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val + 1) bit range. - The results are written in place -*/ -#define SAT_UH2(RTYPE, in0, in1, sat_val) \ - { \ - in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ - } -#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) - -#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ - { \ - SAT_UH2(RTYPE, in0, in1, sat_val); \ - SAT_UH2(RTYPE, in2, in3, sat_val) \ - } -#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) - -/* Description : Saturate the halfword element values to the max - unsigned value of (sat_val + 1) bits - The element data width remains unchanged - Arguments : Inputs - in0, in1, sat_val - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val + 1) bit range - The results are written in place -*/ -#define SAT_SH2(RTYPE, in0, in1, sat_val) \ - { \ - in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ - } -#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) - -#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ - { \ - SAT_SH2(RTYPE, in0, in1, sat_val); \ - SAT_SH2(RTYPE, in2, in3, sat_val); \ - } -#define SAT_SH4_SH(...) 
SAT_SH4(v8i16, __VA_ARGS__) - -/* Description : Indexed halfword element values are replicated to all - elements in output vector - Arguments : Inputs - in, idx0, idx1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : 'idx0' element value from 'in' vector is replicated to all - elements in 'out0' vector - Valid index range for halfword operation is 0-7 -*/ -#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ - out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ - } -#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) - -#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ - { \ - SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ - SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ - } -#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) -#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) - -/* Description : Pack even byte elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even byte elements of 'in0' are copied to the left half of - 'out0' & even byte elements of 'in1' are copied to the right - half of 'out0'. -*/ -#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ - } -#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) -#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) -#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) - -#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) -#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) -#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) - -/* Description : Pack even halfword elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even halfword elements of 'in0' are copied to the left half of - 'out0' & even halfword elements of 'in1' are copied to the - right half of 'out0'. -*/ -#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ - } -#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) -#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) - -#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) - -/* Description : Pack even double word elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even double elements of 'in0' are copied to the left half of - 'out0' & even double elements of 'in1' are copied to the right - half of 'out0'. -*/ -#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ - out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ - } -#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) -#define PCKEV_D2_SH(...) 
PCKEV_D2(v8i16, __VA_ARGS__) - -#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) - -/* Description : Each byte element is logically xor'ed with immediate 128 - Arguments : Inputs - in0, in1 - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned byte element from input vector 'in0' is - logically xor'ed with 128 and the result is stored in-place. -*/ -#define XORI_B2_128(RTYPE, in0, in1) \ - { \ - in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ - in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ - } -#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) -#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) - -#define XORI_B3_128(RTYPE, in0, in1, in2) \ - { \ - XORI_B2_128(RTYPE, in0, in1); \ - in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ - } -#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) - -#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ - { \ - XORI_B2_128(RTYPE, in0, in1); \ - XORI_B2_128(RTYPE, in2, in3); \ - } -#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) -#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) - -#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ - { \ - XORI_B4_128(RTYPE, in0, in1, in2, in3); \ - XORI_B3_128(RTYPE, in4, in5, in6); \ - } -#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) - -/* Description : Average of signed halfword elements -> (a + b) / 2 - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3 - Return Type - as per RTYPE - Details : Each signed halfword element from 'in0' is added to each - signed halfword element of 'in1' with full precision resulting - in one extra bit in the result. The result is then divided by - 2 and written to 'out0' -*/ -#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ - out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ - out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ - } -#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) - -/* Description : Addition of signed halfword elements and signed saturation - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'in0' are added to signed - halfword elements of 'in1'. The result is then signed saturated - between halfword data type range -*/ -#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ - } -#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) - -#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) - -/* Description : Shift left all elements of vector (generic for all data types) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in place operation - Return Type - as per input vector RTYPE - Details : Each element of vector 'in0' is left shifted by 'shift' and - the result is written in-place. 
-*/ -#define SLLI_4V(in0, in1, in2, in3, shift) \ - { \ - in0 = in0 << shift; \ - in1 = in1 << shift; \ - in2 = in2 << shift; \ - in3 = in3 << shift; \ - } - -/* Description : Arithmetic shift right all elements of vector - (generic for all data types) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in place operation - Return Type - as per input vector RTYPE - Details : Each element of vector 'in0' is right shifted by 'shift' and - the result is written in-place. 'shift' is a GP variable. -*/ -#define SRA_4V(in0, in1, in2, in3, shift) \ - { \ - in0 = in0 >> shift; \ - in1 = in1 >> shift; \ - in2 = in2 >> shift; \ - in3 = in3 >> shift; \ - } - -/* Description : Shift right arithmetic rounded words - Arguments : Inputs - in0, in1, shift - Outputs - in place operation - Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetically by - the number of bits in the corresponding element in the vector - 'shift'. The last discarded bit is added to shifted value for - rounding and the result is written in-place. - 'shift' is a vector. -*/ -#define SRAR_W2(RTYPE, in0, in1, shift) \ - { \ - in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ - in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ - } - -#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ - { \ - SRAR_W2(RTYPE, in0, in1, shift) \ - SRAR_W2(RTYPE, in2, in3, shift) \ - } -#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) - -/* Description : Shift right arithmetic rounded (immediate) - Arguments : Inputs - in0, in1, shift - Outputs - in place operation - Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetically by - the value in 'shift'. The last discarded bit is added to the - shifted value for rounding and the result is written in-place. - 'shift' is an immediate value. -*/ -#define SRARI_H2(RTYPE, in0, in1, shift) \ - { \ - in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ - in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ - } -#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) -#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) - -#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ - { \ - SRARI_H2(RTYPE, in0, in1, shift); \ - SRARI_H2(RTYPE, in2, in3, shift); \ - } -#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) -#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) - -#define SRARI_W2(RTYPE, in0, in1, shift) \ - { \ - in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ - in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ - } -#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) - -#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ - { \ - SRARI_W2(RTYPE, in0, in1, shift); \ - SRARI_W2(RTYPE, in2, in3, shift); \ - } -#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) - -/* Description : Logical shift right all elements of vector (immediate) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - out0, out1, out2, out3 - Return Type - as per RTYPE - Details : Each element of vector 'in0' is right shifted by 'shift' and - the result is written in-place. 'shift' is an immediate value. -*/ -#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ - { \ - out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ - out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ - out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ - out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ - } -#define SRLI_H4_SH(...) 
SRLI_H4(v8i16, __VA_ARGS__) - -/* Description : Multiplication of pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element from 'in0' is multiplied with elements from 'in1' - and the result is written to 'out0' -*/ -#define MUL2(in0, in1, in2, in3, out0, out1) \ - { \ - out0 = in0 * in1; \ - out1 = in2 * in3; \ - } -#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ - { \ - MUL2(in0, in1, in2, in3, out0, out1); \ - MUL2(in4, in5, in6, in7, out2, out3); \ - } - -/* Description : Addition of 2 pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element in 'in0' is added to 'in1' and result is written - to 'out0'. -*/ -#define ADD2(in0, in1, in2, in3, out0, out1) \ - { \ - out0 = in0 + in1; \ - out1 = in2 + in3; \ - } -#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ - { \ - ADD2(in0, in1, in2, in3, out0, out1); \ - ADD2(in4, in5, in6, in7, out2, out3); \ - } - -/* Description : Subtraction of 2 pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element in 'in1' is subtracted from 'in0' and result is - written to 'out0'. -*/ -#define SUB2(in0, in1, in2, in3, out0, out1) \ - { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ - } -#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ - { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ - out2 = in4 - in5; \ - out3 = in6 - in7; \ - } - -/* Description : Sign extend halfword elements from right half of the vector - Arguments : Input - in (halfword vector) - Output - out (sign extended word vector) - Return Type - signed word - Details : Sign bit of halfword elements from input vector 'in' is - extracted and interleaved with same vector 'in0' to generate - 4 word elements keeping sign intact -*/ -#define UNPCK_R_SH_SW(in, out) \ - { \ - v8i16 sign_m; \ - \ - sign_m = __msa_clti_s_h((v8i16)in, 0); \ - out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ - } - -/* Description : Zero extend unsigned byte elements to halfword elements - Arguments : Input - in (unsigned byte vector) - Outputs - out0, out1 (unsigned halfword vectors) - Return Type - signed halfword - Details : Zero extended right half of vector is returned in 'out0' - Zero extended left half of vector is returned in 'out1' -*/ -#define UNPCK_UB_SH(in, out0, out1) \ - { \ - v16i8 zero_m = { 0 }; \ - \ - ILVRL_B2_SH(zero_m, in, out0, out1); \ - } - -/* Description : Sign extend halfword elements from input vector and return - the result in pair of vectors - Arguments : Input - in (halfword vector) - Outputs - out0, out1 (sign extended word vectors) - Return Type - signed word - Details : Sign bit of halfword elements from input vector 'in' is - extracted and interleaved right with same vector 'in0' to - generate 4 signed word elements in 'out0' - Then interleaved left with same vector 'in0' to - generate 4 signed word elements in 'out1' -*/ -#define UNPCK_SH_SW(in, out0, out1) \ - { \ - v8i16 tmp_m; \ - \ - tmp_m = __msa_clti_s_h((v8i16)in, 0); \ - ILVRL_H2_SW(tmp_m, in, out0, out1); \ - } - -/* Description : Butterfly of 4 input vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Details : Butterfly operation -*/ -#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = in0 + in3; \ - out1 = in1 + in2; \ - \ - out2 = in1 - in2; \ - out3 = in0 - in3; \ - } - -/* Description : Butterfly of 8 input vectors - Arguments : 
Inputs - in0 ... in7 - Outputs - out0 .. out7 - Details : Butterfly operation -*/ -#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - out0 = in0 + in7; \ - out1 = in1 + in6; \ - out2 = in2 + in5; \ - out3 = in3 + in4; \ - \ - out4 = in3 - in4; \ - out5 = in2 - in5; \ - out6 = in1 - in6; \ - out7 = in0 - in7; \ - } - -/* Description : Butterfly of 16 input vectors - Arguments : Inputs - in0 ... in15 - Outputs - out0 .. out15 - Details : Butterfly operation -*/ -#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ - in11, in12, in13, in14, in15, out0, out1, out2, out3, \ - out4, out5, out6, out7, out8, out9, out10, out11, out12, \ - out13, out14, out15) \ - { \ - out0 = in0 + in15; \ - out1 = in1 + in14; \ - out2 = in2 + in13; \ - out3 = in3 + in12; \ - out4 = in4 + in11; \ - out5 = in5 + in10; \ - out6 = in6 + in9; \ - out7 = in7 + in8; \ - \ - out8 = in7 - in8; \ - out9 = in6 - in9; \ - out10 = in5 - in10; \ - out11 = in4 - in11; \ - out12 = in3 - in12; \ - out13 = in2 - in13; \ - out14 = in1 - in14; \ - out15 = in0 - in15; \ - } - -/* Description : Transpose input 8x8 byte block - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - as per RTYPE -*/ -#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7) \ - { \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ - tmp3_m); \ - ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ - ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ - ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ - ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ - SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ - SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ - } -#define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) - -/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - unsigned byte -*/ -#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ - in10, in11, in12, in13, in14, in15, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ - ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ - ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ - ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ - \ - tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ - tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ - tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ - tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ - out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ - tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ - out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ - tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ - \ - ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ - out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ - out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ - out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - } - -/* Description : Transpose 4x4 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Return Type - signed halfword -*/ -#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 s0_m, s1_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ - ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ - } - -/* Description : Transpose 4x8 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - signed halfword -*/ -#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ - v8i16 zero_m = { 0 }; \ - \ - ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ - tmp3_n); \ - ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ - ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ - \ - out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out1 = 
(v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - \ - out4 = zero_m; \ - out5 = zero_m; \ - out6 = zero_m; \ - out7 = zero_m; \ - } - -/* Description : Transpose 8x4 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - signed halfword -*/ -#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ - ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ - ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ - } - -/* Description : Transpose 8x8 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - as per RTYPE -*/ -#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 s0_m, s1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ - ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ - ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ - PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ - tmp7_m, out0, out2, out4, out6); \ - out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ - } -#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) - -/* Description : Transpose 4x4 block with word elements in vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Return Type - signed word -*/ -#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ - ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ - \ - out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ - out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ - out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ - out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ - } - -/* Description : Add block 4x4 - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Least significant 4 bytes from each input vector are added to - the destination bytes, clipped between 0-255 and stored. 
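   Example     : A scalar sketch of the same add-and-clip step (plain C,
                 illustrative only and not taken from the original file;
                 'res' stands for the residual halfwords carried in in0..in3):

                   for (int i = 0; i < 4; ++i) {
                     for (int j = 0; j < 4; ++j) {
                       int v = pdst[i * stride + j] + res[i][j];
                       if (v < 0) v = 0;
                       if (v > 255) v = 255;
                       pdst[i * stride + j] = (uint8_t)v;
                     }
                   }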
-*/ -#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ - { \ - uint32_t src0_m, src1_m, src2_m, src3_m; \ - v8i16 inp0_m, inp1_m, res0_m, res1_m; \ - v16i8 dst0_m = { 0 }; \ - v16i8 dst1_m = { 0 }; \ - v16i8 zero_m = { 0 }; \ - \ - ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ - LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ - INSERT_W2_SB(src0_m, src1_m, dst0_m); \ - INSERT_W2_SB(src2_m, src3_m, dst1_m); \ - ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ - ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ - CLIP_SH2_0_255(res0_m, res1_m); \ - PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ - ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ - } - -/* Description : Pack even elements of input vectors & xor with 128 - Arguments : Inputs - in0, in1 - Output - out_m - Return Type - unsigned byte - Details : Signed byte even elements from 'in0' and 'in1' are packed - together in one vector and the resulting vector is xor'ed with - 128 to shift the range from signed to unsigned byte -*/ -#define PCKEV_XORI128_UB(in0, in1) \ - ({ \ - v16u8 out_m; \ - \ - out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ - out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ - out_m; \ - }) - -/* Description : Converts inputs to unsigned bytes, interleave, average & store - as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride -*/ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ - pdst, stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ - } - -/* Description : Pack even byte elements and store byte vector in destination - memory - Arguments : Inputs - in0, in1, pdst -*/ -#define PCKEV_ST_SB(in0, in1, pdst) \ - { \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ - ST_SB(tmp_m, (pdst)); \ - } - -/* Description : Horizontal 2 tap filter kernel code - Arguments : Inputs - in0, in1, mask, coeff, shift -*/ -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ - ({ \ - v16i8 tmp0_m; \ - v8u16 tmp1_m; \ - \ - tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ - tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ - tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ - \ - tmp1_m; \ - }) -#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c deleted file mode 100644 index 58cdd80d9..000000000 --- a/third_party/aom/aom_dsp/mips/sad_msa.c +++ /dev/null @@ -1,800 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
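For orientation: the kernels in this file compute sums of absolute differences
(SAD). aom_sadWxH_msa scores one WxH source block against one reference block,
the x4d variants score four reference candidates against the same source, and
the _avg variants first replace the reference with the rounded average of the
reference and a second predictor. A plain-C sketch of the core computation
(hypothetical helper, not taken from this file):

  #include <stdint.h>
  #include <stdlib.h>

  static uint32_t sad_ref(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int width, int height) {
    uint32_t sad = 0;
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) sad += abs(src[j] - ref[j]);
      src += src_stride;
      ref += ref_stride;
    }
    return sad;
  }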
- */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ - { \ - out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ - } -#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) - -static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 diff; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - - diff = __msa_asub_u_b(src, ref); - sad += __msa_hadd_u_h(diff, diff); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - LD_UB2(ref, ref_stride, ref0, ref1); - ref += (2 * ref_stride); - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - LD_UB2(ref, ref_stride, ref0, ref1); - ref += (2 * ref_stride); - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - uint32_t sad = 0; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0 
= { 0 }; - v8u16 sad1 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = HADD_UH_U32(sad0); - sad += HADD_UH_U32(sad1); - - return sad; -} - -static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, src); - src_ptr += (4 * src_stride); - - LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref0_ptr += (4 * ref_stride); - - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref1_ptr += (4 * ref_stride); - - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref2_ptr += (4 * ref_stride); - - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref3_ptr += (4 * ref_stride); - - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - int32_t ht_cnt; - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; - v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref0_ptr += (4 * ref_stride); - LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); - ref1_ptr += (4 * ref_stride); - LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); - ref2_ptr += (4 * ref_stride); - LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); - ref3_ptr += (4 * ref_stride); - - PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, 
ref0, ref1); - - PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - int32_t ht_cnt; - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - v16u8 src, ref0, ref1, ref2, ref3, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref0 = LD_UB(ref0_ptr); - ref0_ptr += ref_stride; - ref1 = LD_UB(ref1_ptr); - ref1_ptr += ref_stride; - ref2 = LD_UB(ref2_ptr); - ref2_ptr += ref_stride; - ref3 = LD_UB(ref3_ptr); - ref3_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref1); - sad1 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref2); - sad2 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref3); - sad3 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref0 = LD_UB(ref0_ptr); - ref0_ptr += ref_stride; - ref1 = LD_UB(ref1_ptr); - ref1_ptr += ref_stride; - ref2 = LD_UB(ref2_ptr); - ref2_ptr += ref_stride; - ref3 = LD_UB(ref3_ptr); - ref3_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref1); - sad1 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref2); - sad2 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref3); - sad3 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - - LD_UB2(ref0_ptr, 16, ref0, ref1); - ref0_ptr += ref_stride; - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(ref1_ptr, 16, ref0, ref1); - ref1_ptr += ref_stride; - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(ref2_ptr, 16, ref0, ref1); - ref2_ptr += ref_stride; - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(ref3_ptr, 16, ref0, ref1); - ref3_ptr += ref_stride; - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t 
ref_stride, int32_t height, - uint32_t *sad_array) { - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - - LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); - ref0_ptr += ref_stride; - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); - ref1_ptr += ref_stride; - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); - ref2_ptr += ref_stride; - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); - ref3_ptr += ref_stride; - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad_array[0] = HADD_UH_U32(sad0_0); - sad_array[0] += HADD_UH_U32(sad0_1); - sad_array[1] = HADD_UH_U32(sad1_0); - sad_array[1] += HADD_UH_U32(sad1_1); - sad_array[2] = HADD_UH_U32(sad2_0); - sad_array[2] += HADD_UH_U32(sad2_1); - sad_array[3] = HADD_UH_U32(sad3_0); - sad_array[3] += HADD_UH_U32(sad3_1); -} - -static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 diff, pred, comp; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - - comp = __msa_aver_u_b(pred, ref); - diff = __msa_asub_u_b(src, comp); - sad += __msa_hadd_u_h(diff, diff); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 diff0, diff1, pred0, pred1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); - sad += SAD_UB2_UH(src0, src1, diff0, diff1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3, comp0, comp1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 
3); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += (4 * 16); - AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); - sad += SAD_UB2_UH(src0, src1, comp0, comp1); - AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); - sad += SAD_UB2_UH(src2, src3, comp0, comp1); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += (4 * 16); - AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); - sad += SAD_UB2_UH(src0, src1, comp0, comp1); - AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); - sad += SAD_UB2_UH(src2, src3, comp0, comp1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; - v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 comp0, comp1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 16, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); - LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); - ref += (4 * ref_stride); - - LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); - LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); - sec_pred += (4 * 32); - - AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); - sad += SAD_UB2_UH(src0, src1, comp0, comp1); - AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); - sad += SAD_UB2_UH(src2, src3, comp0, comp1); - AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); - sad += SAD_UB2_UH(src4, src5, comp0, comp1); - AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); - sad += SAD_UB2_UH(src6, src7, comp0, comp1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 comp0, comp1, comp2, comp3; - v16u8 pred0, pred1, pred2, pred3; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v4u32 sad; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += 
ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); - } - - sad = __msa_hadd_u_w(sad0, sad0); - sad += __msa_hadd_u_w(sad1, sad1); - - return HADD_SW_S32(sad); -} - -#define AOM_SAD_4xHEIGHT_MSA(height) \ - uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_8xHEIGHT_MSA(height) \ - uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_16xHEIGHT_MSA(height) \ - uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_32xHEIGHT_MSA(height) \ - uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_64xHEIGHT_MSA(height) \ - uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_4xHEIGHTx4D_MSA(height) \ - void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_8xHEIGHTx4D_MSA(height) \ - void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_16xHEIGHTx4D_MSA(height) \ - void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_32xHEIGHTx4D_MSA(height) \ - void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_64xHEIGHTx4D_MSA(height) \ - void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_AVGSAD_4xHEIGHT_MSA(height) \ - uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { 
\ - return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_8xHEIGHT_MSA(height) \ - uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_16xHEIGHT_MSA(height) \ - uint32_t aom_sad16x##height##_avg_msa( \ - const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ - int32_t ref_stride, const uint8_t *second_pred) { \ - return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_32xHEIGHT_MSA(height) \ - uint32_t aom_sad32x##height##_avg_msa( \ - const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ - int32_t ref_stride, const uint8_t *second_pred) { \ - return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_64xHEIGHT_MSA(height) \ - uint32_t aom_sad64x##height##_avg_msa( \ - const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ - int32_t ref_stride, const uint8_t *second_pred) { \ - return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -/* clang-format off */ -// 64x64 -AOM_SAD_64xHEIGHT_MSA(64) -AOM_SAD_64xHEIGHTx4D_MSA(64) -AOM_AVGSAD_64xHEIGHT_MSA(64) - -// 64x32 -AOM_SAD_64xHEIGHT_MSA(32) -AOM_SAD_64xHEIGHTx4D_MSA(32) -AOM_AVGSAD_64xHEIGHT_MSA(32) - -// 32x64 -AOM_SAD_32xHEIGHT_MSA(64) -AOM_SAD_32xHEIGHTx4D_MSA(64) -AOM_AVGSAD_32xHEIGHT_MSA(64) - -// 32x32 -AOM_SAD_32xHEIGHT_MSA(32) -AOM_SAD_32xHEIGHTx4D_MSA(32) -AOM_AVGSAD_32xHEIGHT_MSA(32) - -// 32x16 -AOM_SAD_32xHEIGHT_MSA(16) -AOM_SAD_32xHEIGHTx4D_MSA(16) -AOM_AVGSAD_32xHEIGHT_MSA(16) - -// 16x32 -AOM_SAD_16xHEIGHT_MSA(32) -AOM_SAD_16xHEIGHTx4D_MSA(32) -AOM_AVGSAD_16xHEIGHT_MSA(32) - -// 16x16 -AOM_SAD_16xHEIGHT_MSA(16) -AOM_SAD_16xHEIGHTx4D_MSA(16) -AOM_AVGSAD_16xHEIGHT_MSA(16) - -// 16x8 -AOM_SAD_16xHEIGHT_MSA(8) -AOM_SAD_16xHEIGHTx4D_MSA(8) -AOM_AVGSAD_16xHEIGHT_MSA(8) - -// 8x16 -AOM_SAD_8xHEIGHT_MSA(16) -AOM_SAD_8xHEIGHTx4D_MSA(16) -AOM_AVGSAD_8xHEIGHT_MSA(16) - -// 8x8 -AOM_SAD_8xHEIGHT_MSA(8) -AOM_SAD_8xHEIGHTx4D_MSA(8) -AOM_AVGSAD_8xHEIGHT_MSA(8) - -// 8x4 -AOM_SAD_8xHEIGHT_MSA(4) -AOM_SAD_8xHEIGHTx4D_MSA(4) -AOM_AVGSAD_8xHEIGHT_MSA(4) - -// 4x8 -AOM_SAD_4xHEIGHT_MSA(8) -AOM_SAD_4xHEIGHTx4D_MSA(8) -AOM_AVGSAD_4xHEIGHT_MSA(8) - -// 4x4 -AOM_SAD_4xHEIGHT_MSA(4) -AOM_SAD_4xHEIGHTx4D_MSA(4) -AOM_AVGSAD_4xHEIGHT_MSA(4) - /* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c deleted file mode 100644 index 810b6efaa..000000000 --- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c +++ /dev/null @@ -1,1792 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
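For orientation: this file computes (sub-pixel) variance. The source block is
first interpolated with a two-tap bilinear filter (horizontally, vertically,
or both, using a rounding shift by FILTER_BITS), optionally averaged with a
second predictor, and then compared against the reference. CALC_MSE_AVG_B
below accumulates both the squared differences (for the SSE) and the signed
differences (for the mean), which VARIANCE_WxH combines. A plain-C sketch of
the whole-pixel variance core (hypothetical helper, not taken from this file):

  #include <stdint.h>

  static uint32_t variance_ref(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int width, int height, uint32_t *sse) {
    int64_t sum = 0;
    uint64_t sq = 0;
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        const int d = src[j] - ref[j];
        sum += d;
        sq += (uint64_t)(d * d);
      }
      src += src_stride;
      ref += ref_stride;
    }
    *sse = (uint32_t)sq;
    // variance = SSE - sum^2 / N; the macros use a right shift because
    // N = width * height is a power of two for these block sizes.
    return (uint32_t)(sq - (uint64_t)(sum * sum) / (uint64_t)(width * height));
  }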
- */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" -#include "aom_dsp/mips/macros_msa.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/variance.h" - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ - } - -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) - -#define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) - -static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t height, - int32_t *diff) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - v16u8 pred, src = { 0 }; - v16u8 ref = { 0 }; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t height, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src, ref, pred; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred 
+= 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1, pred0, pred1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1, pred0, pred1; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - 
CALC_MSE_AVG_B(src1, ref1, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v8i16 avg2 = { 0 }; - v8i16 avg3 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 32; ht_cnt--;) { - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - vec += __msa_hadd_s_w(avg2, avg2); - vec += __msa_hadd_s_w(avg3, avg3); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_4width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 filt0, 
ref = { 0 }; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); - src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); - CALC_MSE_AVG_B(src0, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 filt0, out, ref0, ref1, ref2, ref3; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); - CALC_MSE_AVG_B(out, ref0, var, avg); - out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); - CALC_MSE_AVG_B(out, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v16u8 dst0, dst1, dst2, dst3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - 
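    /* Each VSHF_B2_UH gathers adjacent pixel pairs according to 'mask';
       DOTP_UB4_UH then applies the two 8-bit filter taps held in 'filt0' to
       every pair, and SRARI_H4_UH performs the rounding right shift by
       FILTER_BITS, yielding the bilinearly filtered rows. */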
VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, dst0, var, avg); - CALC_MSE_AVG_B(src1, dst1, var, avg); - CALC_MSE_AVG_B(src2, dst2, var, avg); - CALC_MSE_AVG_B(src3, dst3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_4width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4, out; - v16u8 src10_r, src32_r, src21_r, src43_r; - v16u8 ref = { 0 }; - v16u8 src2110, src4332; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - v8u16 tmp0, tmp1; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 vec0, vec1, vec2, vec3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = 
(v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1, out2, out3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - src0 = src4; - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, 
dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_4width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out, ref = { 0 }; - v16u8 filt_vt, filt_hz, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; - v8u16 tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt_vt, filt_hz, vec0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, 
filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3; - v8u16 tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - LD_UB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - CALC_MSE_AVG_B(src2, ref2, var, avg); - CALC_MSE_AVG_B(src3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t 
height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height, - &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height, - &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 out, pred, filt0, ref = { 0 }; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); - out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 out, pred, filt0; - v16u8 ref0, ref1, ref2, ref3; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, 
vec1, vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); - - pred = LD_UB(sec_pred); - sec_pred += 16; - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref0, var, avg); - out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); - pred = LD_UB(sec_pred); - sec_pred += 16; - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff, int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v16u8 dst0, dst1, dst2, dst3; - v16u8 tmp0, tmp1, tmp2, tmp3; - v16u8 pred0, pred1, pred2, pred3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - dst += (4 * dst_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, - tmp2, tmp3); - AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, - tmp2, tmp3); - - CALC_MSE_AVG_B(tmp0, dst0, var, avg); - CALC_MSE_AVG_B(tmp1, dst1, var, avg); - CALC_MSE_AVG_B(tmp2, dst2, var, avg); - CALC_MSE_AVG_B(tmp3, dst3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( - const uint8_t *src, int32_t src_stride, const 
uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 src10_r, src32_r, src21_r, src43_r; - v16u8 out, pred, ref = { 0 }; - v16u8 src2110, src4332, filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - v8u16 tmp0, tmp1; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, filt0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff, int32_t width) 
{ - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1, out2, out3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - src0 = src4; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, - out2, out3); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t 
*filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 out, pred, ref = { 0 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 pred0, pred1, out0, out1; - v16u8 filt_hz, filt_vt, vec0; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, 
(v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v16u8 out0, out1, out2, out3; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - LD_UB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, - out2, out3); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - 
-static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); -#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); -#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); - -#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); -#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); -#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); -#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); - -#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ - uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \ - const uint8_t *src, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sse) { \ - int32_t diff; \ - uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ - src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ - src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ - } \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ - src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ - sse); \ - } \ - } \ 
- \ - return var; \ - } - -/* clang-format off */ -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) -/* clang-format on */ - -#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ - uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ - uint32_t *sse, const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ - &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ - &diff); \ - } else { \ - *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ - ref_stride, sec_pred, ht, &diff); \ - } \ - } \ - \ - return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } - -/* clang-format off */ -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8) - -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16) - -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32) - -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32) -/* clang-format on */ - -uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - int32_t xoffset, int32_t yoffset, - const uint8_t *ref_ptr, - int32_t ref_stride, uint32_t *sse, - const uint8_t *sec_pred) { - int32_t diff; - const uint8_t *h_filter = bilinear_filters_2t[xoffset]; - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; - - if (yoffset) { - if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_hv_msa( - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, - v_filter, 64, &diff); - } else { - *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, - ref_stride, sec_pred, - v_filter, 64, &diff); - } - } else { - if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, - ref_stride, sec_pred, - h_filter, 64, &diff); - } else { - *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, - sec_pred, &diff); - } - } - - return VARIANCE_32Wx64H(*sse, diff); -} - -#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ - uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ - uint32_t *sse, const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t 
*h_filter = bilinear_filters_2t[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ - &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ - &diff); \ - } else { \ - *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ - ref_stride, sec_pred, &diff); \ - } \ - } \ - \ - return VARIANCE_64Wx##ht##H(*sse, diff); \ - } - -/* clang-format off */ -AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32) -AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64) -/* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c deleted file mode 100644 index bfed773ac..000000000 --- a/third_party/aom/aom_dsp/mips/subtract_msa.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *pred_ptr, int32_t pred_stride, - int16_t *diff_ptr, int32_t diff_stride) { - uint32_t src0, src1, src2, src3; - uint32_t pred0, pred1, pred2, pred3; - v16i8 src = { 0 }; - v16i8 pred = { 0 }; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - LW4(src_ptr, src_stride, src0, src1, src2, src3); - LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); - INSERT_W4_SB(src0, src1, src2, src3, src); - INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); - ILVRL_B2_UB(src, pred, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); -} - -static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *pred_ptr, int32_t pred_stride, - int16_t *diff_ptr, int32_t diff_stride) { - uint32_t loop_cnt; - uint64_t src0, src1, pred0, pred1; - v16i8 src = { 0 }; - v16i8 pred = { 0 }; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 4; loop_cnt--;) { - LD2(src_ptr, src_stride, src0, src1); - src_ptr += (2 * src_stride); - LD2(pred_ptr, pred_stride, pred0, pred1); - pred_ptr += (2 * pred_stride); - - INSERT_D2_SB(src0, src1, src); - INSERT_D2_SB(pred0, pred1, pred); - ILVRL_B2_UB(src, pred, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff_ptr, diff_stride); - diff_ptr += (2 * diff_stride); - } -} - -static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - int8_t count; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (count = 
2; count--;) { - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, - pred7); - pred += (8 * pred_stride); - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - } -} - -static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 8; loop_cnt--;) { - LD_SB2(src, 16, src0, src1); - src += src_stride; - LD_SB2(src, 16, src2, src3); - src += src_stride; - LD_SB2(src, 16, src4, src5); - src += src_stride; - LD_SB2(src, 16, src6, src7); - src += src_stride; - - LD_SB2(pred, 16, pred0, pred1); - pred += pred_stride; - LD_SB2(pred, 16, pred2, pred3); - pred += pred_stride; - LD_SB2(pred, 16, pred4, pred5); - pred += pred_stride; - LD_SB2(pred, 16, pred6, pred7); - pred += pred_stride; - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - } -} - -static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, 
src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 32; loop_cnt--;) { - LD_SB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_SB4(src, 16, src4, src5, src6, src7); - src += src_stride; - - LD_SB4(pred, 16, pred0, pred1, pred2, pred3); - pred += pred_stride; - LD_SB4(pred, 16, pred4, pred5, pred6, pred7); - pred += pred_stride; - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 32, 8); - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 48, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 32, 8); - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 48, 8); - diff += diff_stride; - } -} - -void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, - ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, - ptrdiff_t pred_stride) { - if (rows == cols) { - switch (rows) { - case 4: - sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - case 8: - sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - case 16: - sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - case 32: - sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - case 64: - sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - default: - aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - } - } else { - aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, - pred_ptr, pred_stride); - } -} diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c deleted file mode 100644 index 065c09ac5..000000000 --- a/third_party/aom/aom_dsp/mips/variance_msa.c +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -#define CALC_MSE_B(src, ref, var) \ - { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - } - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ - } - -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) - -#define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) - -static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - int32_t ht_cnt; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src, ref; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - int32_t 
ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t *diff) { - int32_t ht_cnt; - 
v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v8i16 avg2 = { 0 };
-  v8i16 avg3 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 32; ht_cnt--;) {
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  vec += __msa_hadd_s_w(avg2, avg2);
-  vec += __msa_hadd_s_w(avg3, avg3);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t get_mb_ss_msa(const int16_t *src) {
-  uint32_t sum, cnt;
-  v8i16 src0, src1, src2, src3;
-  v4i32 src0_l, src1_l, src2_l, src3_l;
-  v4i32 src0_r, src1_r, src2_r, src3_r;
-  v2i64 sq_src_l = { 0 };
-  v2i64 sq_src_r = { 0 };
-
-  for (cnt = 8; cnt--;) {
-    LD_SH4(src, 8, src0, src1, src2, src3);
-    src += 4 * 8;
-
-    UNPCK_SH_SW(src0, src0_l, src0_r);
-    UNPCK_SH_SW(src1, src1_l, src1_r);
-    UNPCK_SH_SW(src2, src2_l, src2_r);
-    UNPCK_SH_SW(src3, src3_l, src3_r);
-
-    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
-    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
-    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
-    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
-  }
-
-  sq_src_l += __msa_splati_d(sq_src_l, 1);
-  sq_src_r += __msa_splati_d(sq_src_r, 1);
-
-  sum = __msa_copy_s_d(sq_src_l, 0);
-  sum += __msa_copy_s_d(sq_src_r, 0);
-
-  return sum;
-}
-
-static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    CALC_MSE_B(src, ref, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
-                ref0, ref1);
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *ref_ptr, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src, ref;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *ref_ptr, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *ref_ptr, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = height >> 1; ht_cnt--;) {
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src2, ref2, var);
-    CALC_MSE_B(src1, ref1, var);
-    CALC_MSE_B(src3, ref3, var);
-
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src2, ref2, var);
-    CALC_MSE_B(src1, ref1, var);
-    CALC_MSE_B(src3, ref3, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride) {
-  uint32_t err = 0;
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16i8 src = { 0 };
-  v16i8 ref = { 0 };
-  v16u8 src_vec0, src_vec1;
-  v8i16 diff0, diff1;
-  v4i32 err0 = { 0 };
-  v4i32 err1 = { 0 };
-
-  LW4(src_ptr, src_stride, src0, src1, src2, src3);
-  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-  INSERT_W4_SB(src0, src1, src2, src3, src);
-  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
-  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
-  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
-  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
-  err = HADD_SW_S32(err0);
-  err += HADD_SW_S32(err1);
-
-  return err;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define AOM_VARIANCE_WDXHT_MSA(wd, ht)                                        \
-  uint32_t aom_variance##wd##x##ht##_msa(                                     \
-      const uint8_t *src, int32_t src_stride, const uint8_t *ref,             \
-      int32_t ref_stride, uint32_t *sse) {                                    \
-    int32_t diff;                                                             \
-                                                                              \
-    *sse =                                                                    \
-        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
-                                                                              \
-    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
-  }
-
-/* clang-format off */
-AOM_VARIANCE_WDXHT_MSA(4, 4)
-AOM_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_VARIANCE_WDXHT_MSA(8, 4)
-AOM_VARIANCE_WDXHT_MSA(8, 8)
-AOM_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_VARIANCE_WDXHT_MSA(16, 8)
-AOM_VARIANCE_WDXHT_MSA(16, 16)
-AOM_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_VARIANCE_WDXHT_MSA(32, 16)
-AOM_VARIANCE_WDXHT_MSA(32, 32)
-/* clang-format on */
-
-uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               uint32_t *sse) {
-  int32_t diff;
-
-  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
-
-  return VARIANCE_32Wx64H(*sse, diff);
-}
-
-uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               uint32_t *sse) {
-  int32_t diff;
-
-  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
-
-  return VARIANCE_64Wx32H(*sse, diff);
-}
-
-uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               uint32_t *sse) {
-  int32_t diff;
-
-  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
-
-  return VARIANCE_64Wx64H(*sse, diff);
-}
-
-uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
-                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
-  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
-
-  return *sse;
-}
-
-uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
-                         const uint8_t *ref, int32_t ref_stride,
-                         uint32_t *sse) {
-  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
-
-  return *sse;
-}
-
-uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
-                         const uint8_t *ref, int32_t ref_stride,
-                         uint32_t *sse) {
-  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
-
-  return *sse;
-}
-
-uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
-                          const uint8_t *ref, int32_t ref_stride,
-                          uint32_t *sse) {
-  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
-
-  return *sse;
-}
-
-void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
-                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
-                       int32_t *sum) {
-  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
-}
-
-void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
-                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
-                         int32_t *sum) {
-  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
-}
-
-uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
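
For reference, the deleted variance kernels split the work into two accumulations per block: the sse_diff_*width_msa helpers return the sum of squared differences (sse) and write back the signed sum of differences (diff), and the VARIANCE_WxH / VARIANCE_LARGE_WxH macros (defined earlier in variance_msa.c, not shown in this hunk) combine them. The plain-C sketch below is editorial, not part of the deleted file; the function name variance_scalar_ref and its parameters are hypothetical, and it assumes the usual variance definition sse - sum*sum / N, which matches the shift table above (4x4 -> shift 4 since 16 = 2^4, 16x16 -> shift 8, 64x64 -> shift 12).

#include <stdint.h>

/* Hypothetical scalar reference for the MSA kernels above: accumulate the
 * sum of squared differences (sse) and the sum of differences (sum) over a
 * wd x ht block, then form variance = sse - sum*sum / (wd * ht).  Because
 * wd * ht is a power of two for every supported block size, the division
 * reduces to the right shift encoded in the VARIANCE_*Wx*H macros. */
static uint32_t variance_scalar_ref(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int wd, int ht, uint32_t *sse) {
  int64_t sum = 0;     /* plays the role of *diff in sse_diff_*width_msa */
  uint64_t sq_sum = 0; /* plays the role of the returned sse */

  for (int i = 0; i < ht; ++i) {
    for (int j = 0; j < wd; ++j) {
      const int d = src[j] - ref[j];
      sum += d;
      sq_sum += (uint32_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }

  *sse = (uint32_t)sq_sum;
  return (uint32_t)(sq_sum - (uint64_t)((sum * sum) / (wd * ht)));
}

Seen this way, the aom_mse*_msa entry points above simply return the sse term and skip the sum accumulation, which is why they call the plain sse_*width_msa helpers, while aom_get8x8var_msa and aom_get16x16var_msa expose both sse and sum to the caller.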