Diffstat (limited to 'third_party/aom/aom_dsp/mips')
59 files changed, 34508 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
new file mode 100644
index 000000000..4c6e201e1
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16], uint32_t width,
+                             uint32_t height, int32_t pitch) {
+  uint32_t i, j;
+
+  for (i = 0; i < height / 2; ++i) {
+    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
+    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
+    for (j = width / 16; j--;) {
+      v16i8 temp00_s, temp01_s;
+      v16u8 temp00, temp01, black_clamp, white_clamp;
+      v16u8 pos0, ref0, pos1, ref1;
+      v16i8 const127 = __msa_ldi_b(127);
+
+      pos0 = LD_UB(pos0_ptr);
+      ref0 = LD_UB(ref0_ptr);
+      pos1 = LD_UB(pos1_ptr);
+      ref1 = LD_UB(ref1_ptr);
+      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+      temp00 = (pos0 < black_clamp);
+      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+      temp01 = (pos1 < black_clamp);
+      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp00 = (v16u8)(temp00_s < pos0);
+      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp01 = (temp01_s < pos1);
+      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      pos0 += ref0;
+      ST_UB(pos0, pos0_ptr);
+      pos1 += ref1;
+      ST_UB(pos1, pos1_ptr);
+      pos0_ptr += 16;
+      pos1_ptr += 16;
+      ref0_ptr += 16;
+      ref1_ptr += 16;
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
new file mode 100644
index 000000000..847394a3d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 mask0, mask1, mask2, mask3; + v8i16 filt, res0, res1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, res0, res1); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + XORI_B2_128_UB(res2, res3); + AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filt, vec0, vec1, vec2, vec3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec0, vec1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec2, vec3); + SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); + SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); + PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, + res3); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + XORI_B2_128_UB(res0, res2); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); + AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v8i16 filt, out0, 
out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + src += (2 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + LD_UB2(dst, dst_stride, dst0, dst1); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); + dst += dst_stride; + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src 
+ 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(dst, 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + for (cnt = 0; cnt < 2; ++cnt) { + src0 = LD_SB(&src[cnt << 5]); + src2 = LD_SB(&src[16 + (cnt << 5)]); + src3 = LD_SB(&src[24 + (cnt << 5)]); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, + vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(&dst[cnt << 5], 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); + } + + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = 
LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); +} + +static void common_hz_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + 
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + } +} + +static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, 
res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB2(dst, 16, dst0, dst1); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); + dst += dst_stride; + LD_UB2(dst, 16, dst2, dst3); + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, 
out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src, 16, src0, src2, src4, src6); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + LD_UB4(dst, 16, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(out1, out0, dst0, dst); + PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); + PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); + PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 32: + common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c new file mode 100644 index 000000000..bed600d5b --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_hv_8ht_8vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); + XORI_B2_128_UB(tmp0, tmp1); + AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 
1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + vec0 = vec2; + vec1 = vec3; + vec2 = vec4; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, 
out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 dst0, dst1, dst2, dst3, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + 
v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, + tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, + res3); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, 
(v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out1 = 
HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0 && + ((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + 
common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else if (((const int32_t *)filter_x)[0] == 0 || + ((const int32_t *)filter_y)[0] == 0) { + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c new file mode 100644 index 000000000..dae771104 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, out; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, + filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, + filt2, filt3); 
+ out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, + filt2, filt3); + out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, int32_t width) { + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + src_tmp += (4 * src_stride); + + LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + AVER_UB4_UB(tmp0, dst0, 
tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1, + dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + src4 = LD_SB(src); + src += src_stride; + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16u8 src2110, src4332, src6554, src8776, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, + dst3); + ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, 
src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); +} + +static void common_vt_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, + dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, + dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t 
*src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB2(src, 16, src0, src5); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, 
src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5; + v16u8 src6, src7, src8, src9, src10, src11, filt0; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(dst + 16, dst_stride, dst2, dst3); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(dst + 32, dst_stride, dst4, dst5); + LD_UB2(src + 48, src_stride, src10, src11); + LD_UB2(dst + 48, dst_stride, dst6, dst7); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + 
filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c new file mode 100644 index 000000000..fc3a823c5 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + SRARI_H2_SH(out0, out1, FILTER_BITS); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1, out2, + out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} + +static 
void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 
7); + + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 vec0, vec1, vec2, vec3, filt0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, 
src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, 
mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height >> 1; 
loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + aom_convolve8_horiz_c(src, 
src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 16: + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 32: + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 64: + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c new file mode 100644 index 000000000..a4d594931 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c @@ -0,0 +1,630 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 
3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + out0 = out2; + out1 = out3; + out2 = out4; + } +} + +static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + 
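+    /* The four freshly loaded rows are sign-adjusted (XORI with 128) and run
+     * through the horizontal 8-tap filter below; each horizontal result is
+     * then interleaved with the previous row and fed to the vertical 8-tap
+     * filter, completing the two-stage 2-D convolution for this block. */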
+ XORI_B4_128_SB(src7, src8, src9, src10); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + 
hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, + vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = 
(v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + 
common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int32_t x_step_q4, 
const int16_t *filter_y, + int32_t y_step_q4, int32_t w, int32_t h) { + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0 && + ((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + break; + } + } else if (((const int32_t *)filter_x)[0] == 0 || + ((const int32_t *)filter_y)[0] == 0) { + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c new file mode 100644 index 000000000..f7bdfc2bd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v16u8 out; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 
7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + 
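/* Vertical 8-tap pass for widths that are multiples of 16: the outer loop + * walks 16-column stripes and the inner loop emits four rows per iteration. + * Each output byte is roughly (filter[0] * p[-3] + ... + filter[7] * p[4] + 64) >> 7, + * clipped to [0, 255], where p[i] = src[(y + i) * src_stride + x] and + * FILTER_BITS == 7. */ +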
const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src_tmp += (4 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, 
height, + 64); +} + +static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + src += src_stride; + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, 
tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src5 = LD_UB(src + 16); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + src 
+= (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(src + 48, src_stride, src10, src11); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + 
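/* Columns 48..63: each (above, below) byte pair is dotted with the two + * bilinear taps packed in filt0, roughly (tap0 * above + tap1 * below + 64) >> 7 + * with FILTER_BITS == 7; the results are packed to bytes and stored two rows + * at a time. */ +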
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 48); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 32: + common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c new file mode 100644 index 000000000..75f8c7ea8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/macros_msa.h" + +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + uint32_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + if (0 == (height % 4)) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + + out0 = __msa_copy_u_w((v4i32)dst0, 0); + out1 = __msa_copy_u_w((v4i32)dst1, 0); + out2 = __msa_copy_u_w((v4i32)dst2, 0); + out3 = __msa_copy_u_w((v4i32)dst3, 0); + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == (height % 2)) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32)dst0, 0); + out1 = __msa_copy_u_w((v4i32)dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + + out0 = __msa_copy_u_d((v2i64)dst0, 0); + out1 = __msa_copy_u_d((v2i64)dst1, 0); + out2 = __msa_copy_u_d((v2i64)dst2, 0); + out3 = __msa_copy_u_d((v2i64)dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (cnt = (height / 8); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void avg_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 8); cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); + LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); + dst_dup += (4 * dst_stride); + LD_UB4(src, src_stride, src8, src10, src12, src14); + LD_UB4(src + 16, src_stride, src9, src11, src13, src15); 
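+ /* The AVER_UB4_UB calls below replace each destination byte with the + * rounded average (s + d + 1) >> 1 of source and destination. */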
+ src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); + LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); + dst_dup += (4 * dst_stride); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); + + ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); + ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); + dst += (4 * dst_stride); + ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); + ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(src, 16, src4, src5, src6, src7); + src += src_stride; + LD_UB4(src, 16, src8, src9, src10, src11); + src += src_stride; + LD_UB4(src, 16, src12, src13, src14, src15); + src += src_stride; + + LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); + dst_dup += dst_stride; + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); + + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += dst_stride; + ST_UB4(dst4, dst5, dst6, dst7, dst, 16); + dst += dst_stride; + ST_UB4(dst8, dst9, dst10, dst11, dst, 16); + dst += dst_stride; + ST_UB4(dst12, dst13, dst14, dst15, dst, 16); + dst += dst_stride; + } +} + +void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 4: { + avg_width4_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 8: { + avg_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c 
b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c new file mode 100644 index 000000000..f7f116f4d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <string.h> +#include "aom_dsp/mips/macros_msa.h" + +static void copy_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + out4 = __msa_copy_u_d((v2i64)src4, 0); + out5 = __msa_copy_u_d((v2i64)src5, 0); + out6 = __msa_copy_u_d((v2i64)src6, 0); + out7 = __msa_copy_u_d((v2i64)src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + out4 = __msa_copy_u_d((v2i64)src4, 0); + out5 = __msa_copy_u_d((v2i64)src5, 0); + out6 = __msa_copy_u_d((v2i64)src6, 0); + out7 = __msa_copy_u_d((v2i64)src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) { + int32_t cnt, loop_cnt; + 
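/* Plain copy, no filtering: width (a multiple of 16) is split into 16-byte + * columns and each column is copied eight rows per inner-loop iteration. */ +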
const uint8_t *src_tmp; + uint8_t *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp, + dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 4: { + uint32_t 
cnt, tmp; + /* 1 word storage */ + for (cnt = h; cnt--;) { + tmp = LW(src); + SW(tmp, dst); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h new file mode 100644 index 000000000..1a0ae4d8d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#define AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" + +extern const uint8_t mc_filt_mask_arr[16 * 3]; + +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ + }) + +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ + filt_h1, filt_h2, filt_h3) \ + ({ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ + vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ + filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) + +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, 
mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ + } + +#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } + +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } + +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ + stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } +#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/avg_msa.c b/third_party/aom/aom_dsp/mips/avg_msa.c new file mode 100644 index 000000000..0e1728155 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/avg_msa.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +uint32_t aom_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + v4u32 sum = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); + HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); + ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); + ADD2(sum0, sum2, sum4, sum6, sum0, sum4); + sum0 += sum4; + + sum = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); + sum = __msa_hadd_u_w(sum0, sum0); + sum = (v4u32)__msa_srari_w((v4i32)sum, 6); + sum_out = __msa_copy_u_w((v4i32)sum, 0); + + return sum_out; +} + +uint32_t aom_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + uint32_t src0, src1, src2, src3; + v16u8 vec = { 0 }; + v8u16 sum0; + v4u32 sum1; + v2u64 sum2; + + LW4(src, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, vec); + + sum0 = __msa_hadd_u_h(vec, vec); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum2 = __msa_hadd_u_d(sum1, sum1); + sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); + sum_out = __msa_copy_u_w((v4i32)sum1, 0); + + return sum_out; +} diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c new file mode 100644 index 000000000..00ab75dc3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/common_dspr2.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; +uint8_t *aom_ff_cropTbl; + +void aom_dsputil_static_init(void) { + int i; + + for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i; + + for (i = 0; i < CROP_WIDTH; i++) { + aom_ff_cropTbl_a[i] = 0; + aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; + } + + aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH]; +} + +#endif diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h new file mode 100644 index 000000000..31159fdcd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/common_dspr2.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_COMMON_MIPS_DSPR2_H_ +#define AOM_COMMON_MIPS_DSPR2_H_ + +#include <assert.h> +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if HAVE_DSPR2 +#define CROP_WIDTH 512 + +extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c" + +static INLINE void prefetch_load(const unsigned char *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store */ +static INLINE void prefetch_store(unsigned char *dst) { + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} + +static INLINE void prefetch_load_streamed(const unsigned char *src) { + __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store */ +static INLINE void prefetch_store_streamed(unsigned char *dst) { + __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_MIPS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c new file mode 100644 index 000000000..d557115b9 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, + w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c new file mode 100644 index 000000000..efbdcf60f --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3; + uint32_t tn1, tn2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. 
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3, tp4; + uint32_t p1, p2, p3, p4, n1; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tp4], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tp4], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. 
pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tp3], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp4], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tp1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" + + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" + + /* store bytes */ + "sb %[tp3], 3(%[dst]) \n\t" + "sb %[tp4], 5(%[dst]) \n\t" + "sb %[tp1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... 
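+       Each pass of the c loop above produces 16 averaged pixels: eight
+       "even" outputs from the aligned loads at offsets 0/4/8/12 and eight
+       "odd" outputs from the +1-offset loads, so four passes cover the
+       64-pixel row (essentially the 16-wide kernel with count fixed at 4).
+       Advance to the next row.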
*/ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c new file mode 100644 index 000000000..066308315 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + int32_t Temp1, Temp2; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* odd 1. 
pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp2](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_stride] "r"(dst_stride)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" + + /* odd 1. 
pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[Temp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
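+       The stores above step down a destination column (by dst_pitch_2, with
+       odd_dst covering the interleaved half), so the output block is written
+       transposed; the row pointer below therefore advances by a single byte
+       per input row.  convolve_bi_horiz_transposed() further down is the
+       plain-C reference for this kernel.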
*/ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + sum += src[x] * filter[3]; + sum += src[x + 1] * filter[4]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 8: + convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 16: + case 32: + convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h, (w / 16)); + break; + case 64: + prefetch_load(src + 32); + convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + default: + convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, + h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c new file mode 100644 index 000000000..dc51ab1cb --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[p1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[p2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[p1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + prefetch_load((const uint8_t *)filter_x); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c new file mode 100644 index 000000000..3367be01a --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
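(the two-tap vertical pass packed in filter45 reads rows y and y+1, so src and dst each advance by one stride per output row)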
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
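(same two-tap pass as the width-w version above, but over a fixed 64-pixel row, producing four output columns per iteration on accumulators $ac0..$ac3)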
*/ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c new file mode 100644 index 000000000..298065adb --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + 
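/* p1/n1 above hold columns x and x+1 of two vertically adjacent source rows packed as halfword pairs, so each dpa.w.ph applies one pair of vertical taps per output; the remaining rows of the 8-tap window are interleaved the same way below. */ +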
"preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] 
"=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* 
pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
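(each byte written above is the rounding average, via addqh_r.w, of the 8-tap filtered value with the pixel already present in dst)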
*/ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + + assert(w <= 64); + assert(h <= 64); + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + if (intermediate_height < h) intermediate_height = h; + + aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, intermediate_height); + + aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { + int x, y; + uint32_t tp1, tp2, tn1; + uint32_t tp3, tp4, tn2; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + + : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 8: + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + 
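/* The wider cases below unroll the same pattern with more word loads per row; each adduh_r.qb averages four packed bytes with rounding, matching the scalar fallback in the default case: dst[x] = (dst[x] + src[x] + 1) >> 1. */ +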
case 16: + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 32: + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], 
%[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 32(%[dst]) \n\t" + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "ulw %[tp3], 36(%[src]) \n\t" + "ulw %[tp4], 36(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 40(%[src]) \n\t" + "ulw %[tp2], 40(%[dst]) \n\t" + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "ulw %[tp3], 44(%[src]) \n\t" + "ulw %[tp4], 44(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 48(%[src]) \n\t" + "ulw %[tp2], 48(%[dst]) \n\t" + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "ulw %[tp3], 52(%[src]) \n\t" + "ulw %[tp4], 52(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 56(%[src]) \n\t" + "ulw %[tp2], 56(%[dst]) \n\t" + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "ulw %[tp3], 60(%[src]) \n\t" + "ulw %[tp4], 60(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + default: + for (y = h; y > 0; --y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c new file mode 100644 index 000000000..f6534b420 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -0,0 +1,998 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. 
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tn3], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. 
pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tn3], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tn2], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn3], 5(%[dst]) \n\t" + + /* odd 4. 
pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tn1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" + + "lbux %[n1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" + + /* store bytes */ + "sb %[tn2], 3(%[dst]) \n\t" + "sb %[tn3], 5(%[dst]) \n\t" + "sb %[tn1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: + aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, + h); + break; + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c new file mode 100644 index 000000000..c871702f4 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c @@ -0,0 +1,1590 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tn1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_stride] "r"(dst_stride)); + + /* Next row... 
*/ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4, n1; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp2], 0(%[src]) \n\t" + "ulw %[tp1], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp1] \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tp3] \n\t" + "preceu.ph.qbl %[n1], %[tp3] \n\t" + "ulw %[tp2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tp2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "lbux %[tp3], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp3], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "ulw %[tp1], 1(%[src]) \n\t" + "ulw %[tp3], 5(%[src]) \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[tp2], %[p3](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "ulw %[tp2], 9(%[src]) \n\t" + + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[n1], %[tp2] \n\t" + "ulw %[Temp1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[Temp1] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[n1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +static void convolve_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y, k; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x * dst_stride] = src[x]; + } + + src += src_stride; + dst += 1; + } +} + +void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + uint32_t pos = 38; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (intermediate_height < h) intermediate_height = h; + + /* copy the src to dst */ + if (filter_x[3] == 0x80) { + copy_horiz_transposed(src - src_stride * 3, src_stride, temp, + intermediate_height, w, intermediate_height); + } else if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp, + intermediate_height, filter_x, w, intermediate_height); + } else { + src -= (src_stride * 3 + 3); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_horiz_4_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + case 8: + convolve_horiz_8_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + 
case 16: + case 32: + convolve_horiz_16_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height, (w / 16)); + break; + case 64: + prefetch_load(src + 32); + convolve_horiz_64_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + default: + convolve_horiz_transposed(src, src_stride, temp, intermediate_height, + filter_x, w, intermediate_height); + break; + } + } + + /* copy the src to dst */ + if (filter_y[3] == 0x80) { + copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); + } else if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, + filter_y, h, w); + } else { + switch (h) { + case 4: + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 8: + convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w, (h / 16)); + break; + case 64: + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + default: + convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, + filter_y, h, w); + break; + } + } +} + +void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: { + uint32_t tp1; + + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 8: { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 16: { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 32: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst 
+ dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 64: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + default: + for (y = h; y--;) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c new file mode 100644 index 000000000..c60557617 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + prefetch_load((const uint8_t *)filter_x); + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c new file mode 100644 index 000000000..d8a90b6ab --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 
0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, 
%[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h new file mode 100644 index 000000000..f8fd9e2b6 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ +#define AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ + +#include <assert.h> + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c new file mode 100644 index 000000000..dc9c63226 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/fwd_txfm_msa.h" + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 step0, step1, step2, step3; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + v8i16 step0_1, step1_1, step2_1, step3_1; + + /* 1st and 2nd set */ + LD_SH4(input, src_stride, in0, in1, in2, in3); + LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff, 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); + + /* 3rd and 4th set */ + LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); + LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 temp0, temp1; + + /* fdct even */ + LD_SH4(input, 8, in0, in1, in2, in3); + LD_SH4(input + 96, 8, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, + vec3, in12, in13, in14, in15); + LD_SH4(input + 32, 8, in4, in5, in6, in7); + LD_SH4(input + 64, 8, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, + in8, in9, in10, in11); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp); + ST_SH(temp1, temp + 512); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 256); + ST_SH(temp1, temp + 768); + + SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, 
vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 128); + ST_SH(temp1, temp + 896); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 640); + ST_SH(temp1, temp + 384); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 64); + ST_SH(temp1, temp + 960); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 576); + ST_SH(temp1, temp + 448); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 320); + ST_SH(temp1, temp + 704); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 192); + ST_SH(temp1, temp + 832); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(input + 32); + in21 = LD_SH(input + 40); + in26 = LD_SH(input + 80); + in27 = LD_SH(input + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(input + 16); + in19 = LD_SH(input + 24); + in28 = LD_SH(input + 96); + in29 = LD_SH(input + 104); + + vec4 = in19 - in20; + ST_SH(vec4, input + 32); + vec4 = in18 - in21; + ST_SH(vec4, input + 40); + vec4 = in29 - in26; + ST_SH(vec4, input + 80); + vec4 = in28 - in27; + ST_SH(vec4, input + 88); + + in21 = in18 + in21; + in20 = in19 + in20; + in27 = in28 + in27; + in26 = in29 + in26; + + LD_SH4(input + 48, 8, in22, in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(input); + in17 = LD_SH(input + 8); + in30 = LD_SH(input + 112); + in31 = LD_SH(input + 120); + + vec4 = in17 - in22; + ST_SH(vec4, input + 16); + vec4 = in16 - in23; + ST_SH(vec4, input + 24); + vec4 = in31 - in24; + ST_SH(vec4, input + 96); + vec4 = in30 - in25; + ST_SH(vec4, input + 104); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + 
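+ /* The DOTP_CONST_PAIR just above is one cosine rotation of the odd-path butterfly; its two results are post-processed by FDCT32_POSTPROC_2V_POS_H and stored as a pair of coefficient rows (here temp_ptr and temp_ptr + 960). The same pattern repeats for the remaining rotations below. */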
FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr); + ST_SH(vec4, temp_ptr + 960); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 448); + ST_SH(vec4, temp_ptr + 512); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 704); + ST_SH(vec5, temp_ptr + 256); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 192); + ST_SH(vec5, temp_ptr + 768); + + LD_SH4(input + 16, 8, in22, in23, in20, in21); + LD_SH4(input + 80, 8, in26, in27, in24, in25); + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 832); + ST_SH(vec4, temp_ptr + 128); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 320); + ST_SH(vec4, temp_ptr + 640); + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 576); + ST_SH(vec4, temp_ptr + 384); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 64); + ST_SH(vec4, temp_ptr + 896); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 step0, step1, step2, step3, step4, step5, step6, step7; + + LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); + ST_SH8(in8, in9, 
in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); + + /* 2nd set */ + LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, + (output + 8 * 8), 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, + tmp1_w, tmp2_w, tmp3_w); + BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, + vec1_r, vec2_r, vec3_r); + + tmp3_w = vec0_r + vec3_r; + vec0_r = vec0_r - vec3_r; + vec3_r = vec1_r + vec2_r; + vec1_r = vec1_r - vec2_r; + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out, 8); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out + 16, 8); + + LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, 
out + 32); + ST_SH(in5, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 40); + ST_SH(in5, out + 48); + + LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 64); + ST_SH(in5, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 72); + ST_SH(in5, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 80); + ST_SH(in5, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 96); + ST_SH(in5, out + 88); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + 
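/* Note: FDCT_POSTPROC_2V_NEG_H (defined in fwd_txfm_msa.h below) descales
+  each lane as (v + 1 + (v < 0)) >> 2; it is the row-pass counterpart of the
+  FDCT32_POSTPROC_2V_POS_H rounding used in the column code above. */
+ 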
FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + 
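/* Note (assumes the usual txfm_macros_msa.h / txfm_common.h definitions,
+  which are not part of this file): DOTP_CONST_PAIR(a, b, c0, c1, o0, o1)
+  appears to implement the fixed-point butterfly rotation
+      o0 = (a * c0 - b * c1 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS
+      o1 = (a * c1 + b * c0 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS
+  with DCT_CONST_BITS == 14 and cospi_k_64 ~= round(16384 * cos(k * pi / 64)). */
+ 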
FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = LD_SH(temp); + in4 = LD_SH(temp + 32); + in2 = LD_SH(temp + 64); + in6 = LD_SH(temp + 96); + in1 = LD_SH(temp + 128); + in7 = LD_SH(temp + 152); + in3 = LD_SH(temp + 192); + in5 = LD_SH(temp + 216); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = LD_SH(temp + 16); + in1_1 = LD_SH(temp + 232); + in2_1 = LD_SH(temp + 80); + in3_1 = LD_SH(temp + 168); + in4_1 = LD_SH(temp + 48); + in5_1 = LD_SH(temp + 176); + in6_1 = LD_SH(temp + 112); + in7_1 = LD_SH(temp + 240); + + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = LD_SH(temp + 8); + in1 = LD_SH(temp + 136); + in2 = LD_SH(temp + 72); + in3 = LD_SH(temp + 200); + in4 = LD_SH(temp + 40); + in5 = LD_SH(temp + 208); + in6 = LD_SH(temp + 104); + in7 = LD_SH(temp + 144); + + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, + 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); + + /* 4th set */ + in0_1 = LD_SH(temp + 24); + in1_1 = LD_SH(temp + 224); + in2_1 = LD_SH(temp + 88); + in3_1 = LD_SH(temp + 160); + in4_1 = LD_SH(temp + 56); + in5_1 = LD_SH(temp + 184); + in6_1 = LD_SH(temp + 120); + in7_1 = LD_SH(temp + 248); + + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output 
+ 24, + 32); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void aom_fdct32x32_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + + temp0 = in0 + in3; + in0 = in0 - in3; + in3 = in1 + in2; + in1 = in1 - in2; + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, 
temp0); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31; + v8i16 vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = 
LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = in28 + in29; + in19 = in31 + in30; + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} + +void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum = LD_HADD(input, stride); + sum += LD_HADD(input + 8, stride); + sum += LD_HADD(input + 16, stride); + sum += LD_HADD(input + 24, stride); + sum += LD_HADD(input + 32 * 8, stride); + sum += LD_HADD(input + 32 * 8 + 8, stride); + sum += LD_HADD(input + 32 * 8 + 16, stride); + sum += LD_HADD(input + 32 * 8 + 24, stride); + sum += LD_HADD(input + 32 * 16, stride); + sum += LD_HADD(input + 32 * 16 + 8, stride); + sum += LD_HADD(input + 32 * 16 + 16, stride); + sum += LD_HADD(input + 32 * 16 + 24, stride); + sum += LD_HADD(input + 32 * 24, stride); + sum += LD_HADD(input + 32 * 24 + 8, stride); + sum += LD_HADD(input + 32 * 24 + 16, stride); + sum += LD_HADD(input + 32 * 24 + 24, stride); + out[0] = (int16_t)(sum >> 3); +} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c new file mode 100644 index 000000000..f16d290c8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/fwd_txfm_msa.h" + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; + v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; + v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; + v8i16 coeff2 = { + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 + }; + + LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, + in10, in11, in12, in13, in14, in15); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in8, in9, in10, in11, 2); + SLLI_4V(in12, in13, in14, in15, 2); + ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); + ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); + SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); + SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); + ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); + + cnst4 = __msa_splati_h(coeff, 0); + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); + + cnst5 = __msa_splati_h(coeff, 1); + cnst5 = __msa_ilvev_h(cnst5, cnst4); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); + stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); + stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); + + /* stp2 */ + BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); + ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); + SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); + + cnst0 = __msa_splati_h(coeff, 4); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); + + BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + ILVRL_H2_SH(in15, in8, vec1, vec0); + SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr); + + cnst0 = __msa_splati_h(coeff2, 0); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 224); + + ILVRL_H2_SH(in14, in9, vec1, vec0); + SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 128); + + cnst1 = __msa_splati_h(coeff2, 2); + cnst0 = 
__msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 96); + + SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + cnst1 = __msa_splati_h(coeff, 3); + cnst1 = __msa_ilvev_h(cnst0, cnst1); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + /* stp4 */ + ADD2(stp34, stp25, stp33, stp22, in13, in10); + + ILVRL_H2_SH(in13, in10, vec1, vec0); + SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 64); + + cnst0 = __msa_splati_h(coeff2, 1); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 160); + + SUB2(stp34, stp25, stp33, stp22, in12, in11); + ILVRL_H2_SH(in12, in11, vec1, vec0); + SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 192); + + cnst1 = __msa_splati_h(coeff2, 3); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 32); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + + LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); + ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); + SRA_4V(in0, in1, in2, in3, 2); + SRA_4V(in4, in5, in6, in7, 2); + SRA_4V(in8, in9, in10, in11, 2); + SRA_4V(in12, in13, in14, in15, 2); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); +} + +void aom_fdct4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 vec, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + vec = __msa_ceqi_h(in0, 0); + vec = vec ^ 255; + vec = mask & vec; + in0 += vec; + } + + AOM_FDCT4(in0, in1, in2, 
in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} + +void aom_fdct8x8_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} + +void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[0] = LD_HADD(input, stride); + out[1] = 0; +} + +void aom_fdct16x16_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} + +void aom_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum = LD_HADD(input, stride); + sum += LD_HADD(input + 8, stride); + sum += LD_HADD(input + 16 * 8, stride); + sum += LD_HADD(input + 16 * 8 + 8, stride); + out[0] = (int16_t)(sum >> 1); +} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h new file mode 100644 index 000000000..ada25dffd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_ +#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_ + +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_dsp/txfm_common.h" + +#define LD_HADD(psrc, stride) \ + ({ \ + v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ + v4i32 vec_w_m; \ + \ + LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ + ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ + LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ + ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ + in0_m, in4_m); \ + in0_m += in4_m; \ + \ + vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ + HADD_SW_S32(vec_w_m); \ + }) + +#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ + v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 coeff_m = { \ + cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ + }; \ + \ + BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ + cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ + vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ + cnst2_m = __msa_splati_h(coeff_m, 2); \ + cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ + vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ + vec7_m, out0, out2, out1, out3); \ + } + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ + SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ + AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ + in2, in3); \ + AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ + in6, in7); \ + } + +#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ 
+ \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v8i16 x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ + v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ + v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ + cospi_10_64, cospi_22_64, 
cospi_6_64, cospi_26_64 }; \ + v8i16 coeff2_m = { \ + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ + }; \ + \ + /* stp 1 */ \ + ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ + ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __msa_splati_h(coeff_m, 0); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ + \ + cnst5_m = __msa_splati_h(coeff_m, 1); \ + cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ + stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ + stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ + \ + /* stp2 */ \ + BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ + stp33_m); \ + BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ + stp34_m); \ + \ + ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ + ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 4); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 3); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + /* stp4 */ \ + BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ + vec5_m); \ + BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ + stp31_m); \ + \ + ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + \ + out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 0); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 2); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 1); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 3); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + } + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = 
one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ + } + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t)const0); \ + \ + s0_m = __msa_fill_w((int32_t)const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + } + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); +#endif // AOM_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c new file mode 100644 index 000000000..0ea127f52 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct16x16_msa.c @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8; + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, + reg2, reg3, reg4, reg5, reg6, reg7); + TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, + reg9, reg10, reg11, reg12, reg13, reg14, reg15); + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4, + reg8); + ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6, + reg10); + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, + reg2, reg4, reg6, reg8, reg10, reg12, reg14); + ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, + reg13, reg11, reg5, reg7, reg9, reg1, reg15); + ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); +} + +void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + /* load 
up 8x8 */ + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8 * 16; + /* load bottom 8x8 */ + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + + reg0 = reg2 - loc1; + reg2 = reg2 + loc1; + reg12 = reg14 - loc0; + reg14 = reg14 + loc0; + reg4 = reg6 - loc3; + reg6 = reg6 + loc3; + reg8 = reg10 - loc2; + reg10 = reg10 + loc2; + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); + dst += (4 * dst_stride); + SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); + dst += (4 * dst_stride); + SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); + dst += (4 * dst_stride); + SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); +} + +void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* process 16 * 8 
block */ + aom_idct16_1d_rows_msa(input, out); + + /* short case just considers top 4 rows as valid output */ + out += 4 * 16; + for (i = 12; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out]) \n\t" + "sw $zero, 4(%[out]) \n\t" + "sw $zero, 8(%[out]) \n\t" + "sw $zero, 12(%[out]) \n\t" + "sw $zero, 16(%[out]) \n\t" + "sw $zero, 20(%[out]) \n\t" + "sw $zero, 24(%[out]) \n\t" + "sw $zero, 28(%[out]) \n\t" + + : + : [out] "r"(out)); + + out += 16; + } + + out = out_arr; + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + int16_t out; + v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 4; i--;) { + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, + l7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, + l12, l13, l14, l15); + + /* ADST in horizontal */ + AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, + r12, r13, r14, r15); + + l1 = -r8; + l3 = -r4; + l13 = -r13; + l15 = -r1; + + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, + l6, l7); + ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, + l13, l14, l15); + ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); +} + +void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 v0, v2, v4, v6, k0, k1, k2, k3; + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v8i16 res8, res9, res10, res11, res12, res13, res14, res15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + v16i8 zero = { 0 }; + + r0 = LD_SH(input + 
0 * 16); + r3 = LD_SH(input + 3 * 16); + r4 = LD_SH(input + 4 * 16); + r7 = LD_SH(input + 7 * 16); + r8 = LD_SH(input + 8 * 16); + r11 = LD_SH(input + 11 * 16); + r12 = LD_SH(input + 12 * 16); + r15 = LD_SH(input + 15 * 16); + + /* stage 1 */ + k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); + k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); + k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); + k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); + k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); + k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); + k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); + k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + r1 = LD_SH(input + 1 * 16); + r2 = LD_SH(input + 2 * 16); + r5 = LD_SH(input + 5 * 16); + r6 = LD_SH(input + 6 * 16); + r9 = LD_SH(input + 9 * 16); + r10 = LD_SH(input + 10 * 16); + r13 = LD_SH(input + 13 * 16); + r14 = LD_SH(input + 14 * 16); + + k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); + k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); + k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); + k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); + k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); + k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); + k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); + k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); + BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); + BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); + out1 = -out1; + SRARI_H2_SH(out0, out1, 6); + dst0 = LD_UB(dst + 0 * dst_stride); + dst1 = LD_UB(dst + 15 * dst_stride); + ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); + ADD2(res0, out0, res1, out1, res0, res1); + CLIP_SH2_0_255(res0, res1); + PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); + ST8x1_UB(res0, dst); + ST8x1_UB(res1, dst + 15 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + + SRARI_H2_SH(out8, out9, 6); + dst8 = LD_UB(dst + 1 * dst_stride); + dst9 = LD_UB(dst + 14 * dst_stride); + ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); + ADD2(res8, out8, res9, out9, res8, res9); + CLIP_SH2_0_255(res8, res9); + PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); + ST8x1_UB(res8, dst + dst_stride); + ST8x1_UB(res9, dst + 14 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); + MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + SRARI_H2_SH(out4, out5, 6); + dst4 = LD_UB(dst + 3 * dst_stride); + dst5 = LD_UB(dst + 12 * dst_stride); + ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); + ADD2(res4, out4, res5, out5, res4, res5); + CLIP_SH2_0_255(res4, res5); + PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); + ST8x1_UB(res4, dst + 3 * dst_stride); + 
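/* as throughout this function, each rounded and clipped result pair is
+     added back into mirrored destination rows r and 15 - r. */
+ 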
ST8x1_UB(res5, dst + 12 * dst_stride); + + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + SRARI_H2_SH(out12, out13, 6); + dst12 = LD_UB(dst + 2 * dst_stride); + dst13 = LD_UB(dst + 13 * dst_stride); + ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); + ADD2(res12, out12, res13, out13, res12, res13); + CLIP_SH2_0_255(res12, res13); + PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); + ST8x1_UB(res12, dst + 2 * dst_stride); + ST8x1_UB(res13, dst + 13 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + MADD_SHORT(out6, out7, k0, k3, out6, out7); + SRARI_H2_SH(out6, out7, 6); + dst6 = LD_UB(dst + 4 * dst_stride); + dst7 = LD_UB(dst + 11 * dst_stride); + ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); + ADD2(res6, out6, res7, out7, res6, res7); + CLIP_SH2_0_255(res6, res7); + PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); + ST8x1_UB(res6, dst + 4 * dst_stride); + ST8x1_UB(res7, dst + 11 * dst_stride); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + SRARI_H2_SH(out10, out11, 6); + dst10 = LD_UB(dst + 6 * dst_stride); + dst11 = LD_UB(dst + 9 * dst_stride); + ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); + ADD2(res10, out10, res11, out11, res10, res11); + CLIP_SH2_0_255(res10, res11); + PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); + ST8x1_UB(res10, dst + 6 * dst_stride); + ST8x1_UB(res11, dst + 9 * dst_stride); + + k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); + k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + SRARI_H2_SH(out2, out3, 6); + dst2 = LD_UB(dst + 7 * dst_stride); + dst3 = LD_UB(dst + 8 * dst_stride); + ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); + ADD2(res2, out2, res3, out3, res2, res3); + CLIP_SH2_0_255(res2, res3); + PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); + ST8x1_UB(res2, dst + 7 * dst_stride); + ST8x1_UB(res3, dst + 8 * dst_stride); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + SRARI_H2_SH(out14, out15, 6); + dst14 = LD_UB(dst + 5 * dst_stride); + dst15 = LD_UB(dst + 10 * dst_stride); + ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); + ADD2(res14, out14, res15, out15, res14, res15); + CLIP_SH2_0_255(res14, res15); + PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); + ST8x1_UB(res14, dst + 5 * dst_stride); + ST8x1_UB(res15, dst + 10 * dst_stride); +} diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c new file mode 100644 index 000000000..f1ca757a0 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct32x32_msa.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); + + /* 3rd & 4th 8x8 */ + LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); + ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 15 * 8)); + ST_SH(loc1, (tmp_eve_buf)); + ST_SH(loc2, (tmp_eve_buf + 14 * 8)); + ST_SH(loc3, (tmp_eve_buf + 8)); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, 
loc0); + ST_SH(loc0, (tmp_eve_buf + 13 * 8)); + ST_SH(loc1, (tmp_eve_buf + 2 * 8)); + ST_SH(loc2, (tmp_eve_buf + 12 * 8)); + ST_SH(loc3, (tmp_eve_buf + 3 * 8)); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 11 * 8)); + ST_SH(loc1, (tmp_eve_buf + 4 * 8)); + ST_SH(loc2, (tmp_eve_buf + 10 * 8)); + ST_SH(loc3, (tmp_eve_buf + 5 * 8)); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 9 * 8)); + ST_SH(loc1, (tmp_eve_buf + 6 * 8)); + ST_SH(loc2, (tmp_eve_buf + 8 * 8)); + ST_SH(loc3, (tmp_eve_buf + 7 * 8)); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 8); + reg1 = LD_SH(tmp_buf + 7 * 8); + reg2 = LD_SH(tmp_buf + 9 * 8); + reg3 = LD_SH(tmp_buf + 15 * 8); + reg4 = LD_SH(tmp_buf + 17 * 8); + reg5 = LD_SH(tmp_buf + 23 * 8); + reg6 = LD_SH(tmp_buf + 25 * 8); + reg7 = LD_SH(tmp_buf + 31 * 8); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf), 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 8); + reg1 = LD_SH(tmp_buf + 5 * 8); + reg2 = LD_SH(tmp_buf + 11 * 8); + reg3 = LD_SH(tmp_buf + 13 * 8); + reg4 = LD_SH(tmp_buf + 19 * 8); + reg5 = LD_SH(tmp_buf + 21 * 8); + reg6 = LD_SH(tmp_buf + 27 * 8); + reg7 = LD_SH(tmp_buf + 29 * 8); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH(reg0, (tmp_odd_buf + 13 * 8)); + ST_SH(reg1, (tmp_odd_buf + 14 * 8)); + + DOTP_CONST_PAIR(reg3, reg2, 
-cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + + ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + + ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + + ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = 
LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + + ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 0), 32); + ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 8), 32); + ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); + + /* 3rd & 4th 8x8 */ + LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 16), 32); + ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 24), 32); + ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); +} + +static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + tmp_buf += (2 * 32); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + 
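/* DOTP_CONST_PAIR(a, b, c0, c1, o0, o1) is essentially one cospi-pair
+     rotation of the vectors (a, b) with DCT_CONST_BITS rounding (see the
+     shared MSA transform macros for the exact output ordering and signs);
+     negating an operand, as in the next call, flips that operand's
+     contribution in both outputs. */
+ 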
DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, tmp_eve_buf, 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 32); + reg1 = LD_SH(tmp_buf + 7 * 32); + reg2 = LD_SH(tmp_buf + 9 * 32); + reg3 = LD_SH(tmp_buf + 15 * 32); + reg4 = LD_SH(tmp_buf + 17 * 32); + reg5 = LD_SH(tmp_buf + 23 * 32); + reg6 = LD_SH(tmp_buf + 25 * 32); + reg7 = LD_SH(tmp_buf + 31 * 32); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, tmp_odd_buf, 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 32); + reg1 = LD_SH(tmp_buf + 5 * 32); + reg2 = LD_SH(tmp_buf + 11 * 32); + reg3 = LD_SH(tmp_buf + 13 * 32); + reg4 = LD_SH(tmp_buf + 19 * 32); + reg5 = LD_SH(tmp_buf + 21 * 32); + reg6 = LD_SH(tmp_buf + 27 * 32); + reg7 = LD_SH(tmp_buf + 29 * 32); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + 
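/* ST_SH2 stores its two vectors one stride apart, so the 3 * 8 stride
+     below places these results in rows 12 and 15 of tmp_odd_buf. */
+ 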
ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + SRARI_H4_SH(m0, m2, m4, m6, 6); + AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); + SRARI_H4_SH(m0, m2, m4, m6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, + m6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + SRARI_H4_SH(m1, m3, m5, m7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); + SRARI_H4_SH(m1, m3, m5, m7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, + m7); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = 
LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + SRARI_H4_SH(n0, n2, n4, n6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); + SRARI_H4_SH(n0, n2, n4, n6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, + n6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + SRARI_H4_SH(n1, n3, n5, n7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); + SRARI_H4_SH(n1, n3, n5, n7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, + n7); +} + +static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); + } + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + for (i = 32; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out_ptr]) \n\t" + "sw $zero, 4(%[out_ptr]) \n\t" + "sw $zero, 8(%[out_ptr]) \n\t" + "sw $zero, 12(%[out_ptr]) \n\t" + "sw $zero, 16(%[out_ptr]) \n\t" + "sw $zero, 20(%[out_ptr]) \n\t" + "sw $zero, 24(%[out_ptr]) \n\t" + "sw $zero, 28(%[out_ptr]) \n\t" + "sw $zero, 32(%[out_ptr]) \n\t" + "sw $zero, 36(%[out_ptr]) \n\t" + "sw $zero, 40(%[out_ptr]) \n\t" + "sw $zero, 44(%[out_ptr]) \n\t" + "sw $zero, 48(%[out_ptr]) \n\t" + "sw $zero, 52(%[out_ptr]) \n\t" + "sw $zero, 56(%[out_ptr]) \n\t" + "sw $zero, 60(%[out_ptr]) \n\t" + + : + : [out_ptr] "r"(out_ptr)); + + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_msa(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = 
ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 16; i--;) { + LD_UB2(dst, 16, dst0, dst1); + LD_UB2(dst + dst_stride, 16, dst2, dst3); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + + ST_UB2(tmp0, tmp1, dst, 16); + dst += dst_stride; + ST_UB2(tmp2, tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c new file mode 100644 index 000000000..274818baa --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct4x4_msa.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in2, in3, in1); + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + UNPCK_R_SH_SW(in0, in0_r); + UNPCK_R_SH_SW(in2, in2_r); + UNPCK_R_SH_SW(in3, in3_r); + UNPCK_R_SH_SW(in1, in1_r); + SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); + + in0_r += in2_r; + in3_r -= in1_r; + in4_r = (in0_r - in3_r) >> 1; + in1_r = in4_r - in1_r; + in2_r = in4_r - in2_r; + in0_r -= in1_r; + in3_r += in2_r; + + TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); + + in0_r += in1_r; + in2_r -= in3_r; + in4_r = (in0_r - in2_r) >> 1; + in3_r = in4_r - in3_r; + in1_r = in4_r - in1_r; + in0_r -= in3_r; + in2_r += in1_r; + + PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, + in2, in3); + ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); +} + +void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t a1, e1; + v8i16 in1, in0 = { 0 }; + + a1 = input[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + + in0 = __msa_insert_h(in0, 0, a1); + in0 = __msa_insert_h(in0, 1, e1); + in0 = __msa_insert_h(in0, 2, e1); + in0 = __msa_insert_h(in0, 3, e1); + + in1 = in0 >> 1; + in0 -= in1; + + ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride); +} + +void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* rows */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + 
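/* each AOM_IDCT4x4 invocation is one in-place 1-D 4-point IDCT pass over
+     the four vectors; the surrounding transposes make this second pass
+     operate on the columns. */
+ 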
AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 4); + vec = __msa_fill_h(out); + + ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); +} diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c new file mode 100644 index 000000000..981c103cd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct8x8_msa.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + /* rows transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 zero = { 0 }; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + /* stage1 */ + ILVL_H2_SH(in3, in0, in2, in1, s0, s1); + k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); + + /* stage2 */ + ILVR_H2_SH(in3, in1, in2, in0, s1, s0); + k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k1 = 
AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); + + /* stage3 */ + s0 = __msa_ilvr_h(s6, s5); + + k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); + SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); + + /* stage4 */ + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, + in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + int32_t val; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + val = ROUND_POWER_OF_TWO(out, 5); + vec = __msa_fill_h(val); + + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); +} diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c new file mode 100644 index 000000000..dc8f20208 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + "lb %[tmp9], 8(%[left]) \n\t" + "lb %[tmp10], 9(%[left]) \n\t" + "lb %[tmp11], 10(%[left]) \n\t" + "lb %[tmp12], 11(%[left]) \n\t" + "lb %[tmp13], 12(%[left]) \n\t" + "lb %[tmp14], 13(%[left]) \n\t" + "lb %[tmp15], 14(%[left]) \n\t" + "lb %[tmp16], 15(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + "replv.qb %[tmp9], %[tmp9] \n\t" + "replv.qb %[tmp10], %[tmp10] \n\t" + "replv.qb %[tmp11], %[tmp11] \n\t" + "replv.qb %[tmp12], %[tmp12] \n\t" + "replv.qb %[tmp13], %[tmp13] \n\t" + "replv.qb %[tmp14], %[tmp14] \n\t" + "replv.qb %[tmp15], %[tmp15] \n\t" + "replv.qb %[tmp16], %[tmp16] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "sw %[tmp1], 8(%[dst]) \n\t" + "sw %[tmp1], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "sw %[tmp2], 8(%[dst]) \n\t" + "sw %[tmp2], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "sw %[tmp3], 8(%[dst]) \n\t" + "sw %[tmp3], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "sw %[tmp4], 8(%[dst]) \n\t" + "sw %[tmp4], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "sw %[tmp5], 8(%[dst]) \n\t" + "sw %[tmp5], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "sw %[tmp6], 8(%[dst]) \n\t" + "sw %[tmp6], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "sw %[tmp7], 8(%[dst]) \n\t" + "sw %[tmp7], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + "sw %[tmp8], 8(%[dst]) \n\t" + "sw %[tmp8], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp9], (%[dst]) \n\t" + "sw %[tmp9], 4(%[dst]) \n\t" + "sw %[tmp9], 8(%[dst]) \n\t" + "sw %[tmp9], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp10], (%[dst]) \n\t" + "sw %[tmp10], 4(%[dst]) \n\t" + "sw %[tmp10], 8(%[dst]) \n\t" + "sw %[tmp10], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp11], (%[dst]) \n\t" + "sw %[tmp11], 4(%[dst]) \n\t" + "sw %[tmp11], 8(%[dst]) \n\t" + "sw %[tmp11], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp12], (%[dst]) \n\t" + "sw %[tmp12], 4(%[dst]) \n\t" + "sw %[tmp12], 8(%[dst]) \n\t" + "sw %[tmp12], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp13], (%[dst]) \n\t" + "sw %[tmp13], 4(%[dst]) \n\t" + "sw %[tmp13], 8(%[dst]) \n\t" + "sw %[tmp13], 12(%[dst]) \n\t" + + 
"add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp14], (%[dst]) \n\t" + "sw %[tmp14], 4(%[dst]) \n\t" + "sw %[tmp14], 8(%[dst]) \n\t" + "sw %[tmp14], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp15], (%[dst]) \n\t" + "sw %[tmp15], 4(%[dst]) \n\t" + "sw %[tmp15], 8(%[dst]) \n\t" + "sw %[tmp15], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp16], (%[dst]) \n\t" + "sw %[tmp16], 4(%[dst]) \n\t" + "sw %[tmp16], 8(%[dst]) \n\t" + "sw %[tmp16], 12(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), + [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), + [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), + [tmp16] "=&r"(tmp16) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "lw %[above1], 8(%[above]) \n\t" + "lw %[above2], 12(%[above]) \n\t" + "lw %[left1], 8(%[left]) \n\t" + "lw %[left2], 12(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addiu %[average], %[average], 16 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 5 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) 
\n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), + [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), + [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), + [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c new file mode 100644 index 000000000..ea7c02810 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "sw %[tmp1], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + + __asm__ __volatile__( + "lw %[above_c], (%[above]) \n\t" + "lw %[left_c], (%[left]) \n\t" + + "preceu.ph.qbl %[above_l], %[above_c] \n\t" + "preceu.ph.qbr %[above_r], %[above_c] \n\t" + "preceu.ph.qbl %[left_l], %[left_c] \n\t" + "preceu.ph.qbr %[left_r], %[left_c] \n\t" + + "addu.ph %[average], %[above_r], %[above_l] \n\t" + "addu.ph %[average], %[average], %[left_l] \n\t" + "addu.ph %[average], %[average], %[left_r] \n\t" + "addiu %[average], %[average], 4 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 3 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + + : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), + [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), + [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} + +void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t left0, left1, left2, left3; + int32_t res0, res1; + int32_t resl; + int32_t resr; + int32_t top_left; + uint8_t *cm = aom_ff_cropTbl; + + __asm__ __volatile__( + "ulw %[resl], (%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + "lbu %[left1], 1(%[left]) \n\t" + "lbu %[left2], 2(%[left]) \n\t" + "lbu %[left3], 3(%[left]) \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + + "preceu.ph.qbl %[abovel], %[resl] \n\t" + "preceu.ph.qbr %[abover], %[resl] \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "replv.ph %[left1], %[left1] \n\t" + "replv.ph %[left2], %[left2] 
\n\t" + "replv.ph %[left3], %[left3] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[resl], %[abovel], %[left0] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left0] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left1] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left1] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left2] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left2] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left3] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left3] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0), + [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0), + [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl), + [resr] "=&r"(resr), [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c new file mode 100644 index 000000000..1114fbc00 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c @@ -0,0 +1,603 @@ 
+/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "preceu.ph.qbl %[above_l2], %[above2] \n\t" + "preceu.ph.qbr %[above_r2], %[above2] \n\t" + "preceu.ph.qbl %[left_l2], %[left2] \n\t" + "preceu.ph.qbr %[left_r2], %[left2] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addu.ph %[average], %[average], %[above_l2] \n\t" + "addu.ph %[average], %[average], %[above_r2] \n\t" + "addu.ph %[average], %[average], %[left_l2] \n\t" + "addu.ph %[average], %[average], %[left_r2] \n\t" + + "addiu %[average], %[average], 8 \n\t" + 
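/* Aside, not part of the patch: at this point %[average] holds the 16 edge
   pixels (8 above + 8 left) accumulated into two 16-bit lanes, with the
   rounding bias 8 added to the low lane; the srl/addu.ph pair that follows
   folds the lanes so the later "srl ..., 4" produces
   expected_dc = (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4,
   which replv.qb then replicates across the word for the row stores. */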
+ "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 4 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), + [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), + [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), + [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), + [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), + [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} + +void aom_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t abovel_1, abover_1; + int32_t left0; + int32_t res0, res1, res2, res3; + int32_t reshw; + int32_t top_left; + uint8_t *cm = aom_ff_cropTbl; + + __asm__ __volatile__( + "ulw %[reshw], (%[above]) \n\t" + "ulw %[top_left], 4(%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + + "preceu.ph.qbl %[abovel], %[reshw] \n\t" + "preceu.ph.qbr %[abover], %[reshw] \n\t" + "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" + "preceu.ph.qbr %[abover_1], %[top_left] \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + "replv.ph %[left0], %[left0] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 1(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], 
%[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 2(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 3(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + 
"sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 4(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 5(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph 
%[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 6(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 7(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), + [abovel_1] "=&r"(abovel_1), [abover_1] 
"=&r"(abover_1), + [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3), + [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw), + [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c new file mode 100644 index 000000000..e8eaec7a9 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = 
(v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = 
__msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w 
= (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val; + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = (v8u16)__msa_fill_h(top_left); + val = LW(src_top_ptr); + src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); + + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val; + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + val = LD(src_top_ptr); + src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 2; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 4; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); + 
HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +static void intra_predict_tm_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top[-1]; + uint32_t loop_cnt; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + LD_SB2(src_top, 16, src_top0, src_top1); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 8; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += 
dst_stride; + } +} + +void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); 
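/* Aside, not part of the patch: the DC_LEFT wrappers reuse the same
   intra_predict_dc_tl_* helpers as the DC_TOP wrappers above, since both
   average a single edge of N pixels as (sum + N/2) >> log2(N) and fill the
   block with the result; only the edge pointer (left vs. above) differs. */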
+} + +void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_32x32_msa(dst, y_stride); +} + +void aom_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_4x4_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_8x8_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_16x16_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_32x32_msa(above, left, dst, y_stride); +} diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h new file mode 100644 index 000000000..8a85e26f3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ +#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ + +#include <assert.h> + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/inv_txfm.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ + ({ \ + \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ + \ + __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 " \ + " \n\t" \ + "mthi $zero, $ac1 " \ + " \n\t" \ + "madd $ac1, %[in], " \ + "%[cospi_16_64] \n\t" \ + "extp %[tmp], $ac1, " \ + "31 \n\t" \ + \ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 " \ + " \n\t" \ + "mthi $zero, $ac2 " \ + " \n\t" \ + "madd $ac2, %[tmp], " \ + "%[cospi_16_64] \n\t" \ + "extp %[out], $ac2, " \ + "31 \n\t" \ + \ + : [tmp] "=&r"(tmp), [out] "=r"(out) \ + : [in] "r"(in), \ + [dct_cost_rounding] "r"(dct_cost_rounding), \ + [cospi_16_64] "r"(cospi_16_64)); \ + out; \ + }) + +void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output); +void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void iadst4_dspr2(const int16_t *input, int16_t *output); +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void iadst8_dspr2(const int16_t *input, int16_t *output); +void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); +void iadst16_dspr2(const int16_t *input, int16_t *output); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h new file mode 100644 index 000000000..122667aa8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_ +#define AOM_DSP_MIPS_INV_TXFM_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_dsp/txfm_common.h" + +#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } + +#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \ + ({ \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = __msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ + }) + +#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ + { \ + uint8_t *dst_m = (uint8_t *)(dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ + res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ + res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, 
res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ + } + +#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ + out0, out1, out2, out3); \ + } + +#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ + -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ + } + +#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ + ({ \ + v8i16 c0_m, c1_m; \ + \ + SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ + }) + +/* multiply and add macro */ +#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, 
tmp3_madd, tmp2_madd, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ + } + +/* idct 8x8 macro */ +#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \ + AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ + out1, out2, out3, out4, out5, out6, out7); \ + } + +#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ + cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ + -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { \ + -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ + }; \ + \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + 
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ + r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ + m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ + m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ + } + +#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ + r12, r13, r14, r15, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ + MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ + k3_m = 
AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ + g11_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ + g15_m); \ + \ + /* stage 2 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ + h3_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ + h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ + h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ + out7); \ + MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ + out13, out15); \ + \ + /* stage 4 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ + } + +void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output); +void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); +#endif // AOM_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c new file mode 100644 index 000000000..c63b1e857 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c @@ -0,0 +1,1190 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--;) { + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi 
$zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + 
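        /* Added annotation (not in the original patch): throughout this
         * file each mtlo/mthi pair seeds an accumulator with the rounding
         * bias 2^(DCT_CONST_BITS-1), madd/msub then accumulate a two-term
         * dot product, and extp (with the bit position set by the caller's
         * wrdsp) extracts (acc >> DCT_CONST_BITS).  In plain C each
         * extracted result is
         *   result = (int)((a * c1 +/- b * c2 + DCT_CONST_ROUNDING)
         *                  >> DCT_CONST_BITS);
         * i.e. the same rounding as the scalar dct_const_round_shift()
         * used elsewhere in this file. */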
"extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] 
\n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), + [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), + [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), + [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), + [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); + + __asm__ __volatile__( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); + + input += 16; + output += 1; + } +} + +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 
32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] 
"=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], 
$ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add 
%[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb 
%[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) + : + [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); + + input += 16; + } +} + +void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct16_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. 
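  // Added sketch (not part of the original patch): the call below runs the
  // row transform for those 4 rows only, and the sw-$zero loop that follows
  // clears the remaining 12 entries of each 16-entry row of the 16x16
  // intermediate.  Assuming out[] is the row-major buffer declared above,
  // that clearing step is equivalent to the plain C:
  //   for (i = 0; i < 16; ++i)
  //     memset(out + i * 16 + 4, 0, 12 * sizeof(int16_t));  // needs <string.h>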
+ idct16_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + outptr += 2; + } + + // Then transform columns + idct16_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst16_dspr2(const int16_t *input, int16_t 
*output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + 
s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c new file mode 100644 index 000000000..d469d1ad0 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c @@ -0,0 +1,1042 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] 
"=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], 
%[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 
\n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo 
%[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : 
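      /* This accumulator sequence computes
         step3_20 = dct_const_round_shift(-step2_20 * cospi_24_64 -
                                          step2_27 * cospi_8_64)
         (rounding constant 2^13 preloaded, 32-bit extract at bit 31);
         the mirrored value step3_27 is computed in plain C just below. */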
[step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], 
%[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + 
DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), + [step2_31] "r"(step2_31)); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + 
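      /* Remaining mirrored rows: step3_13 and step3_12 were already rounded
         with ROUND_POWER_OF_TWO(.., 6) above, so here they are only added to
         the destination byte, clipped through the cm[] crop table and stored
         while dest_pix1 walks upward by dest_stride. */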
"add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), + [step1_27] "r"(step1_27)); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + 
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), + [step1_23] "r"(step1_23)); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + 
"addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), + [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), + [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c new file mode 100644 index 000000000..fa7703217 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./aom_config.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = no_rows; i--;) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + "sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r"(output)); + + output += 1; + + continue; + } + + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 32)); + prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) 
\n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] 
"=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + 
+ "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" 
+ "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ 
__volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub 
%[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64) + + ); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + 
step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = step1_2 + step2_29; + output[3 * 32] = step1_3 + step2_28; + output[4 * 32] = step1_4 + step1_27; + output[5 * 32] = step1_5 + step1_26; + output[6 * 32] = step1_6 + step1_25; + output[7 * 32] = step1_7 + step1_24; + output[8 * 32] = step1_8 + step1_23; + output[9 * 32] = step1_9 + step1_22; + output[10 * 32] = step1_10 + step1_21; + output[11 * 32] = step1_11 + step1_20; + output[12 * 32] = step1_12 + step2_19; + output[13 * 32] = step1_13 + step2_18; + output[14 * 32] = step1_14 + step2_17; + output[15 * 32] = step1_15 + step2_16; + output[16 * 32] = step1_15 - step2_16; + output[17 * 32] = step1_14 - step2_17; + output[18 * 32] = step1_13 - step2_18; + output[19 * 32] = step1_12 - step2_19; + output[20 * 32] = step1_11 - step1_20; + output[21 * 32] = step1_10 - 
step1_21; + output[22 * 32] = step1_9 - step1_22; + output[23 * 32] = step1_8 - step1_23; + output[24 * 32] = step1_7 - step1_24; + output[25 * 32] = step1_6 - step1_25; + output[26 * 32] = step1_5 - step1_26; + output[27 * 32] = step1_4 - step1_27; + output[28 * 32] = step1_3 - step2_28; + output[29 * 32] = step1_2 - step2_29; + output[30 * 32] = step1_1 - step2_30; + output[31 * 32] = step1_0 - step2_31; + + input += 32; + output += 1; + } +} + +void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 32); + + // Columns + aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 8); + + outptr += 8; + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + for (i = 0; i < 31; ++i) { + outptr += 32; + + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + } + + // Columns + aom_idct32_cols_add_blk_dspr2(out, dest, stride); +} + +void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 
8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c new file mode 100644 index 000000000..e6d0367cd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + + for (i = 4; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); + + input += 4; + output += 1; + } +} + +void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], 
$ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), + [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); + + input += 4; + } +} + +void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + aom_idct4_rows_dspr2(input, outptr); + + // Columns + aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; + + /* bit positon for extract from acc */ + 
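+  /* The wrdsp below programs the accumulator extract position used by the
+   * extp-based DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64() macro.  A rough
+   * scalar sketch of this DC-only path (clip_pixel() is assumed here purely
+   * for illustration; the optimized code keeps everything in packed
+   * quad-byte registers):
+   *
+   *   a1 = ROUND_POWER_OF_TWO(
+   *            DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc), 4);
+   *   for (r = 0; r < 4; ++r)
+   *     for (c = 0; c < 4; ++c)
+   *       dest[r * dest_stride + c] =
+   *           clip_pixel(dest[r * dest_stride + c] + a1);
+   *
+   * replv.qb broadcasts |a1| into all four bytes of a word so that
+   * addu_s.qb / subu_s.qb can update four pixels per instruction with
+   * unsigned saturation, which provides the clipping. */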
__asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); + __asm__ __volatile__( + "addi %[out], %[out], 8 \n\t" + "sra %[a1], %[out], 4 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst4_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c new file mode 100644 index 000000000..0a20f76f2 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c @@ -0,0 +1,645 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" 
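+        /* Note on the accumulator idiom used throughout this block: each
+         * accumulator is preloaded with DCT_CONST_ROUNDING (2^13) via
+         * mtlo/mthi, the products are summed with madd/msub, and
+         * "extp $acN, 31" extracts bits 45..14 of the accumulator (the
+         * extract position 45 is programmed with wrdsp by the callers),
+         * so a single extp performs a complete dct_const_round_shift(). */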
+ + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), + [input] "r"(input)); + + input += 8; + output += 1; + } +} + +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + 
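+        /* Register-pressure note: step1_6 temporarily holds step2_0 (the
+         * rounded (input[0] + input[4]) * cospi_16_64 term) until step1_0
+         * and step1_3 are formed; the real step1_6 is produced later, in
+         * the cospi_16_64 stage that combines step1_4..step1_7. */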
"mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + 
"add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); + + input += 8; + } +} + +void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 8); + + // Then transform columns and add to dest + 
idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 4); + + outptr += 4; + + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 48(%[outptr]) \n\t" + "sw $zero, 52(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 68(%[outptr]) \n\t" + "sw $zero, 80(%[outptr]) \n\t" + "sw $zero, 84(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 100(%[outptr]) \n\t" + "sw $zero, 112(%[outptr]) \n\t" + "sw $zero, 116(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + // Then transform columns and add to dest + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t t1, t2, vector_a1, vector_1, vector_2; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 16 \n\t" + "sra %[a1], %[out], 5 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst8_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3, x4, x5, x6, x7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = 
output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); + x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); + x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); + x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} +#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c new file mode 100644 index 000000000..fc0c32ce3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c @@ -0,0 +1,1487 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_ports/mem.h" +#include "aom_dsp/mips/loopfilter_msa.h" + +int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if 
(__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); + + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + 
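+    /* For p2..q2 the flat8 results saved in filter48[] by
+     * aom_hz_lpf_t4_and_t8_16w() are reloaded and only overwritten by the
+     * 15-tap result in the lanes where flat2 is set; the same pattern
+     * repeats for p1, p0, q0, q1 and q2 below. */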
ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); 
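+    /* q5 and q6 below follow the same incremental update as the outputs
+     * above: tmp1 accumulates the difference between the samples entering
+     * and leaving the 15-tap window (with p7/q7 replicated at the ends),
+     * and each new sum is rounded with __msa_srari_h(sum, 4). */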
+ src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + } +} + +void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + (void)count; + + early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + aom_hz_lpf_t16_16w(src, pitch, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (1 == count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, tmp1, tmp2; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { + /* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); 
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, + zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, + q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + 
src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } + } else { + aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, + thresh_ptr, count); + } +} + +void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; + v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, + p1_org, p0_org); + /* 8x8 transpose */ + TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, + p0_org, p7, p6, p5, p4, p3, p2, p1, p0); + /* 8x8 transpose */ + ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, + tmp0, tmp1, tmp2, tmp3); + ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); + ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); + ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); + ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); + SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +static void 
transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); + TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, + q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); + ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); +} + +static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, + int32_t out_pitch) { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp2, tmp3; + + LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); + input += (8 * in_pitch); + LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0); + q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1); + q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2); + q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3); + q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4); + q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5); + q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6); + q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7); + + ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); + tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7); + tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5); + + ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); + tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3); + tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1); + + ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); + q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); + q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = 
(v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); + p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); + p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); + q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); + q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); + q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; 
+ tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += 
tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + + early_exit = + aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, 
p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, 
p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)(tmp1_l), 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + 
tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const 
uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c new file mode 100644 index 000000000..dc0a97764 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + +void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + 
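For reference, the per-pixel arithmetic that the AOM_LPF_FILTER4_4W / AOM_LPF_FILTER4_8W macros above (and the DSPr2 filter_dspr2() routine in the files further below) vectorize can be written as a small scalar routine. The sketch below is reconstructed from the step-by-step comments in the DSPr2 inline assembly and is illustrative only: the helper names (signed_char_clamp, lpf_filter4_scalar) are not part of the library, and it assumes that mask and hev are per-pixel 0x00/0xff flags and that pixels have already been biased into signed range by XOR with 0x80, as the MSA/DSPr2 code does with XORI_B*_128 and the 0x80808080 constant.

#include <stdint.h>

/* Clamp an intermediate value to the signed 8-bit range, mirroring the
   "aom_signed_char_clamp" referenced in the DSPr2 comments. */
static int8_t signed_char_clamp(int32_t t) {
  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

/* Scalar model of the 4-tap edge filter for one pixel column.
   ps1/ps0/qs0/qs1 are the two pixels on each side of the edge, already
   converted to signed form (value ^ 0x80); mask/hev are 0x00 or 0xff. */
static void lpf_filter4_scalar(int8_t mask, int8_t hev, int8_t *ps1,
                               int8_t *ps0, int8_t *qs0, int8_t *qs1) {
  int8_t filt = (int8_t)(signed_char_clamp(*ps1 - *qs1) & hev);
  filt = (int8_t)(signed_char_clamp(filt + 3 * (*qs0 - *ps0)) & mask);

  /* round one side with +4 and the other with +3 before the >> 3 */
  int8_t filter1 = (int8_t)(signed_char_clamp(filt + 4) >> 3);
  int8_t filter2 = (int8_t)(signed_char_clamp(filt + 3) >> 3);

  *qs0 = signed_char_clamp(*qs0 - filter1);
  *ps0 = signed_char_clamp(*ps0 + filter2);

  /* the outer taps are only adjusted where there is no high edge variance */
  filt = (int8_t)(((filter1 + 1) >> 1) & ~hev);
  *qs1 = signed_char_clamp(*qs1 - filt);
  *ps1 = signed_char_clamp(*ps1 + filt);
}

The surrounding SIMD code appears to apply exactly this update to 8 or 16 pixel columns at once, with LPF_MASK_HEV producing mask and hev as per-byte 0x00/0xff selectors that later feed the __msa_bmnz_v blends.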
+void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + +void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c new file mode 100644 index 000000000..dc203e79c --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + +void aom_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + tmp = (v16u8)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); 
+ + b_limit = (v16u8)__msa_fill_b(*b_limit0); + tmp = (v16u8)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + tmp = (v16u8)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); + } +} + +void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4; + + /* load vector elements */ + LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, 
q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, + p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + /* Store 6 pixels p2-_q2 */ + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src -= 3; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 4, src + 4, pitch); + } +} + +void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + vec0 = (v8i16)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + vec0 = (v8i16)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + vec0 = (v8i16)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); 
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + + /* filter8 */ + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c new file mode 100644 index 000000000..883d0523d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + /* loop filter designed to work using chars so that we can make maximum use + of 8 bit simd instructions. */ + for (i = 0; i < 2; i++) { + sm1 = s - (pitch << 2); + s0 = sm1 + pitch; + s1 = s0 + pitch; + s2 = s - pitch; + s3 = s; + s4 = s + pitch; + s5 = s4 + pitch; + s6 = s5 + pitch; + + __asm__ __volatile__( + "lw %[p1], (%[s1]) \n\t" + "lw %[p2], (%[s2]) \n\t" + "lw %[p3], (%[s3]) \n\t" + "lw %[p4], (%[s4]) \n\t" + + : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + mask will be zero and filtering is not needed */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + __asm__ __volatile__( + "lw %[pm1], (%[sm1]) \n\t" + "lw %[p0], (%[s0]) \n\t" + "lw %[p5], (%[s5]) \n\t" + "lw %[p6], (%[s6]) \n\t" + + : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) + : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); + + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + __asm__ __volatile__( + "sw %[p1], (%[s1]) \n\t" + "sw %[p2], (%[s2]) \n\t" + "sw %[p3], (%[s3]) \n\t" + "sw %[p4], (%[s4]) \n\t" + + : + : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), + [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + } + } + + s = s + 4; + } +} + +void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 
= s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s4] "r"(s4)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] 
"r"(p1), + [s2] "r"(s2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); + } + } + } +} + +void aom_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h new file mode 100644 index 000000000..72df09823 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -0,0 +1,735 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* inputs & outputs are quad-byte vectors */ +static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* aom_filter &= hev; */ + "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" + + /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + + /* aom_filter &= mask; */ + "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" + + : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" + + /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], 
%[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = 
vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* aom_filter &= hev; */ + "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" + + /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + + /* aom_filter &= mask; */ + "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" + + : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" + + /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], 
%[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] 
\n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t *op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] 
"=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); + + __asm__ __volatile__( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph 
%[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], %[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph 
%[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h new file mode 100644 index 000000000..3e6994714 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
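(Reference aside on the filters header just shown.) filter_dspr2() and filter1_dspr2() above apply the same 4-tap update to four pixel columns at once, keeping each byte in a 16-bit halfword (the *_l / *_r pairs) so the saturating addq_s.ph / subq_s.ph arithmetic cannot spill into a neighbouring pixel. A plain-C sketch of that update on a single column follows, assuming the 0x80 bias has already been applied (the N128 xor above) and that mask and hev are 0 or -1 per pixel; sclamp and filter4_scalar are illustrative names, not part of the patch:

#include <stdint.h>

/* Illustrative scalar equivalent of the per-column filter update above. */
static int8_t sclamp(int v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void filter4_scalar(int mask, int hev, int8_t *ps1, int8_t *ps0,
                           int8_t *qs0, int8_t *qs1) {
  int8_t f = sclamp(*ps1 - *qs1);
  int8_t filter1, filter2;

  f &= hev;                               /* inner taps act only on strong edges */
  f = sclamp(f + 3 * (*qs0 - *ps0));
  f &= mask;                              /* zero where this column is skipped */

  filter1 = (int8_t)(sclamp(f + 4) >> 3); /* round one side with +4 ... */
  filter2 = (int8_t)(sclamp(f + 3) >> 3); /* ... and the other with +3  */
  *qs0 = sclamp(*qs0 - filter1);
  *ps0 = sclamp(*ps0 + filter2);

  f = (int8_t)((filter1 + 1) >> 1);       /* outer taps, halved and rounded */
  f &= ~hev;
  *qs1 = sclamp(*qs1 - f);
  *ps1 = sclamp(*ps1 + f);
}

The +4 / +3 split is why the vector code keeps separate Filter1 and Filter2 values: it rounds the two sides of the edge in opposite directions, as the "round one side +4 and the other +3" comment above notes.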
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define STORE_F0() \ + { \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r"(p1_f0) \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ + [p0_f0] "r"(p0_f0)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ + } + +#define STORE_F1() \ + { \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ + [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] 
"r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } + +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] 
"r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } + +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] 
"=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } + +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h new file mode 100644 index 000000000..8db3e521f --- 
/dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* processing 4 pixels at the same time + * compute hev and mask in the same function */ +static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, + uint32_t p1, uint32_t p0, uint32_t p3, + uint32_t p2, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t thresh, uint32_t *hev, + uint32_t *mask) { + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], 
%[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; +} + +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb 
%[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= (abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], 
%[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c new file mode 100644 index 000000000..a3b5a9eb1 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
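(Reference aside on the mask helpers just shown.) filter_hev_mask_dspr2(), filter_hev_mask_flatmask4_dspr2() and flatmask5() evaluate the loop-filter decision thresholds four columns at a time: cmpgu.lt.qb accumulates per-byte violation flags and pick.qb converts them into 0x00 / 0xFF selector bytes. A per-column C sketch of the same tests follows; the function name and scalar form are illustrative only, and abs() comes from <stdlib.h>, which these headers already include:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative per-column version of the packed hev/mask/flat tests above;
 * outputs use the same 0x00 / 0xFF convention as the selector bytes. */
static void hev_mask_flat_scalar(uint8_t limit, uint8_t blimit, uint8_t thresh,
                                 uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                 uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                                 uint8_t *hev, uint8_t *mask, uint8_t *flat) {
  int m = abs(p3 - p2) > limit;
  m |= abs(p2 - p1) > limit;
  m |= abs(p1 - p0) > limit;
  m |= abs(q1 - q0) > limit;
  m |= abs(q2 - q1) > limit;
  m |= abs(q3 - q2) > limit;
  m |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  *mask = (uint8_t)(m ? 0 : 0xFF);   /* 0xFF: this column gets filtered */

  *hev = (uint8_t)((abs(p1 - p0) > thresh || abs(q1 - q0) > thresh) ? 0xFF : 0);

  /* flat uses the fixed threshold 1 (flat_thresh = 0x01010101 above) */
  int fl = abs(p1 - p0) > 1 || abs(q1 - q0) > 1 || abs(p0 - p2) > 1 ||
           abs(q0 - q2) > 1 || abs(p3 - p0) > 1 || abs(q3 - q0) > 1;
  *flat = (uint8_t)(fl ? 0 : 0xFF);
}

flatmask5() additionally requires abs(p4 - p0) and abs(q4 - q0) to stay within the same threshold of 1 before the widest filter is allowed, which is the "flat & flatmask4(...)" combination noted in its final comment.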
+ */ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat != 0) && (mask != 0)) { + /* filtering */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, 
&q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + 
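      /* Each byte lane of (mask & flat) selects, per pixel column, whether the
       * mbfilter_dspr2() result or the plain filter1_dspr2() result is written
       * back.  Columns 0 and 1 live in the *_r halfword pairs and columns 2
       * and 3 in the *_l pairs, so after each column the packed registers are
       * shifted right (by 16 bits for the halfword-packed mbfilter outputs, by
       * 8 bits for the byte-packed *_f0 outputs) to bring the next column into
       * the low byte for the sb stores.  In scalar terms, with lane = 0..3
       * (lane is an illustrative name, not a variable in this file):
       *   if (mask & flat & (0xFFu << (8 * lane)))  store the mbfilter outputs
       *   else if (mask & (0xFFu << (8 * lane)))    store the filter outputs
       */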
__asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], 
%[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], 
%[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c new file mode 100644 index 000000000..8d2fd69f7 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -0,0 +1,734 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); + + __asm__ __volatile__( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] 
"r"(sq0), + [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) 
\n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] 
"+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb 
%[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + "sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] 
"r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + 
[q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], +3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c new file mode 100644 index 000000000..28528869b --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + __asm__ __volatile__( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] 
"=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1 q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 q4_3 + + after transpose + register + q7 q4_3 q5_3 q26_3 q7_3 + q6 q4_2 q5_2 q26_2 q7_2 + q5 q4_1 q5_1 q26_1 q7_1 + q4 q4_0 q5_0 q26_0 q7_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, 
p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), 
[p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] 
"r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { 
+ __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask 
& flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s1] "r"(s1)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h new file mode 100644 index 000000000..450594262 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_LOOPFILTER_MSA_H_ +#define AOM_DSP_LOOPFILTER_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out 
= __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp_flat5, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ + \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + } + 
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif /* AOM_DSP_LOOPFILTER_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h new file mode 100644 index 000000000..48fbcfd47 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/macros_msa.h @@ -0,0 +1,2057 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_MACROS_MSA_H_ +#define AOM_DSP_MIPS_MACROS_MSA_H_ + +#include <msa.h> + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) + +#if (__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" 
\ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } + +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ + } + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } +#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) 
LD_B5(v16i8, __VA_ARGS__) + +#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } +#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) + +#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) + +#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } +#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Input - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } + +/* Description : Store 4x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to the GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, 
idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. Then the average + with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } +#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) + +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } +#define AVER_UB4_UB(...) 
AVER_UB4(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } +#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } +#define DOTP_UB2_UH(...) 
DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. 
+ The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of input + i.e. signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + +/* Description : Minimum values between unsigned elements of + either vector are copied to the output vector + Arguments : Inputs - in0, in1, min_vec + Outputs - in place operation + Return Type - as per RTYPE + Details : Minimum of unsigned halfword element values from 'in0' and + 'min_vec' are written to output vector 'in0' +*/ +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } +#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) + +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } +#define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Inputs - in (unsigned halfword vector) + Outputs - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of input vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. 
+*/ +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) + +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) 
ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } +#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. 
+*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } +#define ILVR_D3_SB(...) 
ILVR_D3(v16i8, __VA_ARGS__) + +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) +#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + } +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + } +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range. + The results are written in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ + } +#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) + +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ + } +#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range + The results are written in place +*/ +#define SAT_SH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ + } +#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) + +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ + } +#define SAT_SH4_SH(...) 
SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) 
PCKEV_D2(v8i16, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. 
+*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Logical shift right all elements of vector (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is an immediate value. +*/ +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } +#define SRLI_H4_SH(...) 
SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. +*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } + +/* Description : Butterfly of 8 input vectors + Arguments : 
Inputs - in0 ... in7 + Outputs - out0 .. out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ + } + +/* Description : Butterfly of 16 input vectors + Arguments : Inputs - in0 ... in15 + Outputs - out0 .. out15 + Details : Butterfly operation +*/ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, \ + out4, out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ + \ + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ + } + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ + tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ + } +#define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + } + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ + } + +/* Description : Transpose 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ + tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = 
(v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ + } + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } + +/* Description : Transpose 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ + tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ + } +#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ + } + +/* Description : Add block 4x4 + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and stored. 
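+                 i.e., each of the 16 destination bytes becomes clip_0_255(dst + in).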
+*/ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ + pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif /* AOM_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c new file mode 100644 index 000000000..258eb5c07 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/sad_msa.c @@ -0,0 +1,1529 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } +#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) + +static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + diff = __msa_asub_u_b(src, ref); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t sad = 0; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0 = { 0 
}; + v8u16 sad1 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + INSERT_W4_UB(src0, src1, src2, src3, src); + + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); + ref += (4 * ref_stride); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src, ref, ref0, ref1, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + 
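/* ref0:ref1 span 32 consecutive reference bytes: offset 0 uses ref0 as is, while the __msa_sldi_b shifts below slide this window by 1 and 2 bytes for the other two candidate positions of the x3 search. */ +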
ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v4u32 sad; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); + ref0_4 = LD_UB(ref + 64); + ref += ref_stride; + + sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_SW_S32((v4i32)sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, 
sad1_1); + sad_array[1] = HADD_SW_S32((v4i32)sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_SW_S32((v4i32)sad); +} + +static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, diff; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); + ref += 
(4 * ref_stride); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src, ref0, ref1, ref; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); + diff = __msa_asub_u_b(src, ref); + sad7 += 
__msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1; + v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); + sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); + sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); + sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); + sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + const uint8_t *src_dup, *ref_dup; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 
sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + v4u32 sad; + + src_dup = src; + ref_dup = ref; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); + ref += ref_stride; + + sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[3] = HADD_SW_S32(sad); + + sad0_0 = (v8u16)__msa_ldi_h(0); + sad0_1 = (v8u16)__msa_ldi_h(0); + sad1_0 = (v8u16)__msa_ldi_h(0); + sad1_1 = (v8u16)__msa_ldi_h(0); + sad2_0 = (v8u16)__msa_ldi_h(0); + sad2_1 = (v8u16)__msa_ldi_h(0); + sad3_0 = (v8u16)__msa_ldi_h(0); + sad3_1 = (v8u16)__msa_ldi_h(0); + + for (ht_cnt = 64; ht_cnt--;) { + LD_UB4(src_dup, 16, src0, src1, src2, src3); + src_dup += src_stride; + LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); + ref_dup += ref_stride; + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[4] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[5] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[6] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[7] = HADD_SW_S32(sad); +} + +static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t 
*sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + + LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref0_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref1_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref2_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref3_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref0_ptr += (4 * ref_stride); + LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); + ref1_ptr += (4 * ref_stride); + LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); + ref2_ptr += (4 * ref_stride); + LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); + ref3_ptr += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src, ref0, ref1, ref2, ref3, 
diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + + LD_UB2(ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + + LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, 
ref3); + + LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad_array[0] = HADD_UH_U32(sad0_0); + sad_array[0] += HADD_UH_U32(sad0_1); + sad_array[1] = HADD_UH_U32(sad1_0); + sad_array[1] += HADD_UH_U32(sad1_1); + sad_array[2] = HADD_UH_U32(sad2_0); + sad_array[2] += HADD_UH_U32(sad2_1); + sad_array[3] = HADD_UH_U32(sad3_0); + sad_array[3] += HADD_UH_U32(sad3_1); +} + +static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff, pred, comp; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + comp = __msa_aver_u_b(pred, ref); + diff = __msa_asub_u_b(src, comp); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 diff0, diff1, pred0, pred1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); + sad += SAD_UB2_UH(src0, src1, diff0, diff1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3, comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += 
SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); + LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); + ref += (4 * ref_stride); + + LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); + LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); + sec_pred += (4 * 32); + + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); + sad += SAD_UB2_UH(src4, src5, comp0, comp1); + AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); + sad += SAD_UB2_UH(src6, src7, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 comp0, comp1, comp2, comp3; + v16u8 pred0, pred1, pred2, pred3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v4u32 sad; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + } + + sad = __msa_hadd_u_w(sad0, sad0); + sad += __msa_hadd_u_w(sad1, sad1); + + 
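/* The two per-half v8u16 accumulators were widened to 32-bit lanes above; HADD_SW_S32 folds the four lanes into the final scalar SAD. */ +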
return HADD_SW_S32(sad); +} + +#define AOM_SAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_4xHEIGHTx3_MSA(height) \ + void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx3_MSA(height) \ + void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx3_MSA(height) \ + void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx3_MSA(height) \ + void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx3_MSA(height) \ + void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_4xHEIGHTx8_MSA(height) \ + void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx8_MSA(height) \ + void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx8_MSA(height) \ + void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx8_MSA(height) \ + void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_32width_x8_msa(src, src_stride, ref, 
ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx8_MSA(height) \ + void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_4xHEIGHTx4D_MSA(height) \ + void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx4D_MSA(height) \ + void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx4D_MSA(height) \ + void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx4D_MSA(height) \ + void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx4D_MSA(height) \ + void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_AVGSAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +/* clang-format off */ +// 64x64 +AOM_SAD_64xHEIGHT_MSA(64) +AOM_SAD_64xHEIGHTx3_MSA(64) +AOM_SAD_64xHEIGHTx8_MSA(64) +AOM_SAD_64xHEIGHTx4D_MSA(64) +AOM_AVGSAD_64xHEIGHT_MSA(64) + +// 64x32 +AOM_SAD_64xHEIGHT_MSA(32) +AOM_SAD_64xHEIGHTx3_MSA(32) +AOM_SAD_64xHEIGHTx8_MSA(32) +AOM_SAD_64xHEIGHTx4D_MSA(32) +AOM_AVGSAD_64xHEIGHT_MSA(32) + +// 32x64 +AOM_SAD_32xHEIGHT_MSA(64) 
+AOM_SAD_32xHEIGHTx3_MSA(64) +AOM_SAD_32xHEIGHTx8_MSA(64) +AOM_SAD_32xHEIGHTx4D_MSA(64) +AOM_AVGSAD_32xHEIGHT_MSA(64) + +// 32x32 +AOM_SAD_32xHEIGHT_MSA(32) +AOM_SAD_32xHEIGHTx3_MSA(32) +AOM_SAD_32xHEIGHTx8_MSA(32) +AOM_SAD_32xHEIGHTx4D_MSA(32) +AOM_AVGSAD_32xHEIGHT_MSA(32) + +// 32x16 +AOM_SAD_32xHEIGHT_MSA(16) +AOM_SAD_32xHEIGHTx3_MSA(16) +AOM_SAD_32xHEIGHTx8_MSA(16) +AOM_SAD_32xHEIGHTx4D_MSA(16) +AOM_AVGSAD_32xHEIGHT_MSA(16) + +// 16x32 +AOM_SAD_16xHEIGHT_MSA(32) +AOM_SAD_16xHEIGHTx3_MSA(32) +AOM_SAD_16xHEIGHTx8_MSA(32) +AOM_SAD_16xHEIGHTx4D_MSA(32) +AOM_AVGSAD_16xHEIGHT_MSA(32) + +// 16x16 +AOM_SAD_16xHEIGHT_MSA(16) +AOM_SAD_16xHEIGHTx3_MSA(16) +AOM_SAD_16xHEIGHTx8_MSA(16) +AOM_SAD_16xHEIGHTx4D_MSA(16) +AOM_AVGSAD_16xHEIGHT_MSA(16) + +// 16x8 +AOM_SAD_16xHEIGHT_MSA(8) +AOM_SAD_16xHEIGHTx3_MSA(8) +AOM_SAD_16xHEIGHTx8_MSA(8) +AOM_SAD_16xHEIGHTx4D_MSA(8) +AOM_AVGSAD_16xHEIGHT_MSA(8) + +// 8x16 +AOM_SAD_8xHEIGHT_MSA(16) +AOM_SAD_8xHEIGHTx3_MSA(16) +AOM_SAD_8xHEIGHTx8_MSA(16) +AOM_SAD_8xHEIGHTx4D_MSA(16) +AOM_AVGSAD_8xHEIGHT_MSA(16) + +// 8x8 +AOM_SAD_8xHEIGHT_MSA(8) +AOM_SAD_8xHEIGHTx3_MSA(8) +AOM_SAD_8xHEIGHTx8_MSA(8) +AOM_SAD_8xHEIGHTx4D_MSA(8) +AOM_AVGSAD_8xHEIGHT_MSA(8) + +// 8x4 +AOM_SAD_8xHEIGHT_MSA(4) +AOM_SAD_8xHEIGHTx3_MSA(4) +AOM_SAD_8xHEIGHTx8_MSA(4) +AOM_SAD_8xHEIGHTx4D_MSA(4) +AOM_AVGSAD_8xHEIGHT_MSA(4) + +// 4x8 +AOM_SAD_4xHEIGHT_MSA(8) +AOM_SAD_4xHEIGHTx3_MSA(8) +AOM_SAD_4xHEIGHTx8_MSA(8) +AOM_SAD_4xHEIGHTx4D_MSA(8) +AOM_AVGSAD_4xHEIGHT_MSA(8) + +// 4x4 +AOM_SAD_4xHEIGHT_MSA(4) +AOM_SAD_4xHEIGHTx3_MSA(4) +AOM_SAD_4xHEIGHTx8_MSA(4) +AOM_SAD_4xHEIGHTx4D_MSA(4) +AOM_AVGSAD_4xHEIGHT_MSA(4) + /* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c new file mode 100644 index 000000000..3eb85107d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c @@ -0,0 +1,1795 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/variance.h" + +static const uint8_t bilinear_filters_msa[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 pred, src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref, pred; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += 
ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += 
ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, 
int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + CALC_MSE_AVG_B(src0, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 filt0, out, ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, 
src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4, out; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 ref = { 0 }; + v16u8 src2110, src4332; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 
filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 
4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out, ref = { 0 }; + v16u8 filt_vt, filt_hz, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt_vt, filt_hz, vec0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = 
(v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + 
int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 out, pred, filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 out, pred, filt0; + v16u8 ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); 
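    /* Editorial sketch, not part of the original patch: the dot products above
     * give p0 * f0 + p1 * f1 per pixel for the two-tap bilinear filter (the
     * taps in bilinear_filters_msa sum to 128).  The SRARI_H4_UH() that
     * follows applies the usual rounding shift, so the horizontal
     * interpolation is roughly:
     *
     *   out = (p0 * f0 + p1 * f1 + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
     *
     * before the bytes are packed, averaged with sec_pred and passed to
     * CALC_MSE_AVG_B().
     */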
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 pred0, pred1, pred2, pred3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t 
sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 out, pred, ref = { 0 }; + v16u8 src2110, src4332, filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, filt0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const 
uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, 
const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 out, pred, ref = { 0 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 pred0, pred1, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, 
mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 out0, out1, out2, out3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = 
__msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var 
= aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) +/* clang-format on */ + +#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ + } \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + int32_t xoffset, int32_t yoffset, + const uint8_t *ref_ptr, + int32_t ref_stride, uint32_t *sse, + const uint8_t *sec_pred) { + int32_t diff; + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + + if (yoffset) { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); + } else { + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); + } + } else { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); + } else { + *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, + sec_pred, &diff); + } + } + + return VARIANCE_32Wx64H(*sse, diff); +} + +#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t 
ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32) +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64) +/* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c new file mode 100644 index 000000000..37b89765d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/subtract_msa.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t src0, src1, src2, src3; + uint32_t pred0, pred1, pred2, pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); +} + +static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t loop_cnt; + uint64_t src0, src1, pred0, pred1; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 4; loop_cnt--;) { + LD2(src_ptr, src_stride, src0, src1); + src_ptr += (2 * src_stride); + LD2(pred_ptr, pred_stride, pred0, pred1); + pred_ptr += (2 * pred_stride); + + INSERT_D2_SB(src0, src1, src); + INSERT_D2_SB(pred0, pred1, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff_ptr, diff_stride); + diff_ptr += (2 * diff_stride); + } +} + +static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + int8_t count; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (count = 2; count--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); + pred += (8 * pred_stride); + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + } +} + +static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, 
pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + LD_SB2(src, 16, src4, src5); + src += src_stride; + LD_SB2(src, 16, src6, src7); + src += src_stride; + + LD_SB2(pred, 16, pred0, pred1); + pred += pred_stride; + LD_SB2(pred, 16, pred2, pred3); + pred += pred_stride; + LD_SB2(pred, 16, pred4, pred5); + pred += pred_stride; + LD_SB2(pred, 16, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + } +} + +static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 32; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_SB4(src, 16, src4, src5, src6, src7); + src += src_stride; + + LD_SB4(pred, 16, pred0, pred1, pred2, pred3); + pred += pred_stride; + LD_SB4(pred, 16, pred4, pred5, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + } +} + +void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t 
pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h new file mode 100644 index 000000000..cba5d4445 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32)__msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + } + +#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ + dst1, dst2, dst3) \ + { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ + tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ + tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ + dst1, dst2, dst3); \ + } + +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ + ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ + }) + +#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ + { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, 
madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ + madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ + } + +#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ + cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ + } +#endif // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c new file mode 100644 index 000000000..745fdfc9c --- /dev/null +++ b/third_party/aom/aom_dsp/mips/variance_msa.c @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + int32_t ht_cnt; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t 
ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + 
v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t get_mb_ss_msa(const int16_t *src) { + uint32_t sum, cnt; + v8i16 src0, src1, src2, src3; + v4i32 src0_l, src1_l, src2_l, src3_l; + v4i32 src0_r, src1_r, src2_r, src3_r; + v2i64 sq_src_l = { 0 }; + v2i64 sq_src_r = { 0 }; + + for (cnt = 8; cnt--;) { + LD_SH4(src, 8, src0, src1, src2, src3); + src += 4 * 8; + + UNPCK_SH_SW(src0, src0_l, src0_r); + UNPCK_SH_SW(src1, src1_l, src1_r); + UNPCK_SH_SW(src2, src2_l, src2_r); + UNPCK_SH_SW(src3, src3_l, src3_r); + + DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r); + } + + sq_src_l += __msa_splati_d(sq_src_l, 1); + sq_src_r += __msa_splati_d(sq_src_r, 1); + + sum = __msa_copy_s_d(sq_src_l, 0); + sum += __msa_copy_s_d(sq_src_r, 0); + + return sum; +} + +static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src, ref; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); 
+ ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + } + + return HADD_SW_S32(var); +} + +uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride) { + uint32_t err = 0; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 src_vec0, src_vec1; + v8i16 diff0, diff1; + v4i32 err0 = { 0 }; + v4i32 err1 = { 0 }; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); + ILVRL_B2_UB(src, ref, src_vec0, src_vec1); + HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); + DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); + err = HADD_SW_S32(err0); + err += HADD_SW_S32(err1); + + return err; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define 
VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_VARIANCE_WDXHT_MSA(4, 4) +AOM_VARIANCE_WDXHT_MSA(4, 8) + +AOM_VARIANCE_WDXHT_MSA(8, 4) +AOM_VARIANCE_WDXHT_MSA(8, 8) +AOM_VARIANCE_WDXHT_MSA(8, 16) + +AOM_VARIANCE_WDXHT_MSA(16, 8) +AOM_VARIANCE_WDXHT_MSA(16, 16) +AOM_VARIANCE_WDXHT_MSA(16, 32) + +AOM_VARIANCE_WDXHT_MSA(32, 16) +AOM_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_32Wx64H(*sse, diff); +} + +uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx32H(*sse, diff); +} + +uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); +} + +void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); +} + +uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } |
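
For reference, the VARIANCE_WxH and VARIANCE_LARGE_WxH macros above implement the standard identity variance = SSE - sum^2 / N, where N = width * height and sum is the signed difference total the kernels return through *diff. Every block size handled here has a power-of-two pixel count, so the division is folded into a right shift by log2(N) (4 for 4x4 up to 12 for 64x64); the LARGE variant only widens sum * sum to 64 bits before shifting. The scalar sketch below is illustrative only, not part of this patch, and the helper name variance_ref is hypothetical.

#include <stdint.h>

/* Scalar illustration of the variance macros: sse and sum correspond to the
 * two accumulators the MSA kernels produce (HADD_SW_S32(var) and *diff). */
static uint32_t variance_ref(uint32_t sse, int32_t sum, int width,
                             int height) {
  /* width * height is a power of two for every size listed above, so this
   * division matches the >> shift in VARIANCE_WxH / VARIANCE_LARGE_WxH. */
  const int64_t mean_sq = ((int64_t)sum * sum) / (width * height);
  return (uint32_t)(sse - mean_sq);
}

For example, aom_variance16x16_msa returns the accumulated SSE minus (diff * diff) >> 8, which is exactly variance_ref(*sse, diff, 16, 16).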
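
Similarly, the sub_pixel_avg_sse_diff_*width_hv_msa kernels shown earlier in this diff vectorize a two-pass bilinear interpolation: a horizontal 2-tap filter, a vertical 2-tap filter (each pass rounded by FILTER_BITS), a rounding average with the second predictor sec_pred, and then SSE/sum accumulation against the reference rows. The scalar sketch below shows that flow for one block; it is an illustrative reference under stated assumptions, not part of this patch. The helper name subpel_avg_sse_diff_ref is hypothetical, and the value 7 for the bilinear filter precision is an assumption taken to match FILTER_BITS in aom_dsp.

#include <stdint.h>

#define BIL_FILTER_BITS 7 /* assumed equal to aom_dsp's FILTER_BITS */
#define BIL_ROUND(x) (((x) + (1 << (BIL_FILTER_BITS - 1))) >> BIL_FILTER_BITS)

/* fh/fv are the 2-tap pairs selected by xoffset/yoffset (the role played by
 * bilinear_filters_msa in the code above); src must provide height + 1 rows
 * and width + 1 columns, as sub-pixel filtering requires. */
static uint32_t subpel_avg_sse_diff_ref(const uint8_t *src, int src_stride,
                                        const uint8_t *dst, int dst_stride,
                                        const uint8_t *sec_pred,
                                        const uint8_t *fh, const uint8_t *fv,
                                        int width, int height,
                                        int32_t *diff) {
  uint32_t sse = 0;
  int64_t sum = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const uint8_t *s = src + i * src_stride + j;
      /* Horizontal pass on rows i and i + 1. */
      const int h0 = BIL_ROUND(s[0] * fh[0] + s[1] * fh[1]);
      const int h1 =
          BIL_ROUND(s[src_stride] * fh[0] + s[src_stride + 1] * fh[1]);
      /* Vertical pass, then rounding average with the second predictor. */
      const int v = BIL_ROUND(h0 * fv[0] + h1 * fv[1]);
      const int p = (v + sec_pred[i * width + j] + 1) >> 1;
      const int d = p - dst[i * dst_stride + j];
      sum += d;
      sse += (uint32_t)(d * d);
    }
  }
  *diff = (int32_t)sum;
  return sse;
}

The aom_sub_pixel_avg_variance wrappers then feed this SSE/diff pair through the same variance macros illustrated above.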