| author | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:15 -0500 |
| --- | --- | --- |
| committer | trav90 <travawine@palemoon.org> | 2018-10-19 21:52:20 -0500 |
| commit | bbcc64772580c8a979288791afa02d30bc476d2e (patch) | |
| tree | 437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/aom_dsp/mips | |
| parent | 14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff) | |
| download | UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar (also .tar.gz, .tar.lz, .tar.xz, .zip) | |
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/aom_dsp/mips')
50 files changed, 67 insertions, 15573 deletions
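Aside from the removed `aom_convolve8_avg_*` MSA sources, the edits visible in this directory are include-path updates: relative includes such as `"./macros_msa.h"` become paths rooted at the aom source tree. Below is a minimal reconstruction of the `add_noise_msa.c` hunk, reflowed from the diff text that follows; exact context lines and indentation are approximate.

```diff
--- a/third_party/aom/aom_dsp/mips/add_noise_msa.c
+++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c
@@ -10,7 +10,8 @@
  */
 #include <stdlib.h>
-#include "./macros_msa.h"
+
+#include "aom_dsp/mips/macros_msa.h"
 
 void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
                              char blackclamp[16], char whiteclamp[16],
```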
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c index 4c6e201e1..96d04cff0 100644 --- a/third_party/aom/aom_dsp/mips/add_noise_msa.c +++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c @@ -10,7 +10,8 @@ */ #include <stdlib.h> -#include "./macros_msa.h" + +#include "aom_dsp/mips/macros_msa.h" void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, char blackclamp[16], char whiteclamp[16], diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c deleted file mode 100644 index 847394a3d..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c +++ /dev/null @@ -1,704 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, res2, res3; - v16u8 mask0, mask1, mask2, mask3; - v8i16 filt, res0, res1; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, res0, res1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - SRARI_H2_SH(res0, res1, FILTER_BITS); - SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - XORI_B2_128_UB(res2, res3); - AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v8i16 filt, vec0, vec1, vec2, vec3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, vec0, vec1); - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, 
src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, vec2, vec3); - SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); - SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); - PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, - res3); - ILVR_D2_UB(res1, res0, res3, res2, res0, res2); - XORI_B2_128_UB(res0, res2); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); - AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); - ST4x8_UB(res0, res2, dst, dst_stride); -} - -static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - int32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - int32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1; - v8i16 filt, out0, out1, out2, out3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height >> 1; loop_cnt--;) { - LD_SB2(src, src_stride, src0, src2); - LD_SB2(src + 8, src_stride, src1, src3); - src += (2 * src_stride); - - XORI_B4_128_SB(src0, src1, src2, src3); - VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); - VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); - VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, - vec14); - VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, - vec15); - DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, - vec9, vec10, vec11); - DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, 
filt1, vec0, vec1, - vec2, vec3); - DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, - vec9, vec10, vec11); - ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, - out2, out3); - LD_UB2(dst, dst_stride, dst0, dst1); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); - dst += dst_stride; - PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); - dst += dst_stride; - } -} - -static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst1, dst2, mask0, mask1, mask2, mask3; - v8i16 filt, out0, out1, out2, out3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - - XORI_B4_128_SB(src0, src1, src2, src3); - VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); - VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); - VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, - vec14); - VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, - vec15); - DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, - vec9, vec10, vec11); - DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, - vec2, vec3); - DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, - vec9, vec10, vec11); - ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - LD_UB2(dst, 16, dst1, dst2); - PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); - PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); - dst += dst_stride; - } -} - -static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt, cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst1, dst2, mask0, mask1, mask2, mask3; - v8i16 filt, out0, out1, out2, out3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height; loop_cnt--;) { - for (cnt = 0; cnt < 2; ++cnt) { - src0 = LD_SB(&src[cnt << 5]); - src2 = LD_SB(&src[16 + (cnt << 5)]); - src3 = LD_SB(&src[24 + (cnt << 5)]); - src1 = __msa_sldi_b(src2, src0, 8); - - XORI_B4_128_SB(src0, src1, src2, src3); - VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, - vec12); - VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, 
vec5, vec9, - vec13); - VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, - vec14); - VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, - vec15); - DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, - vec1, vec2, vec3); - DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, - vec9, vec10, vec11); - DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, - vec1, vec2, vec3); - DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, - vec9, vec10, vec11); - ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - LD_UB2(&dst[cnt << 5], 16, dst1, dst2); - PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); - PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); - } - - src += src_stride; - dst += dst_stride; - } -} - -static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; - v8u16 vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); - SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v8u16 vec4, vec5, vec6, vec7, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, - vec6, vec7); - SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, - res3); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, 
dst_stride, filter); - } -} - -static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); -} - -static void common_hz_2t_and_aver_dst_8x8mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - if (16 == height) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - } -} - -static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - 
common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, - filter, height); - } -} - -static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, - res2, res3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, - res6, res7); - SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); - SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST_UB(res1, res0, dst0, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res3, res2, dst1, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res5, res4, dst2, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res7, res6, dst3, dst); - dst += dst_stride; - - for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, - res2, res3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, - res6, res7); - SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); - SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST_UB(res1, res0, dst0, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res3, res2, dst1, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res5, res4, dst2, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res7, res6, dst3, dst); - dst += dst_stride; - } -} - -static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = (height >> 1); loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - src4 = LD_SB(src); - src6 = LD_SB(src + 16); - src7 = LD_SB(src + 24); - src5 
= __msa_sldi_b(src6, src4, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, - res2, res3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, - res6, res7); - SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); - SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); - LD_UB2(dst, 16, dst0, dst1); - PCKEV_AVG_ST_UB(res1, res0, dst0, dst); - PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); - dst += dst_stride; - LD_UB2(dst, 16, dst2, dst3); - PCKEV_AVG_ST_UB(res5, res4, dst2, dst); - PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); - dst += dst_stride; - } -} - -static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = height; loop_cnt--;) { - LD_SB4(src, 16, src0, src2, src4, src6); - src7 = LD_SB(src + 56); - SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - LD_UB4(dst, 16, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST_UB(out1, out0, dst0, dst); - PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); - PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); - PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); - dst += dst_stride; - } -} - -void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_hor[8]; - - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0) { - switch (w) { - case 4: - common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 8: - common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 16: - common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 32: - common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 64: - common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - default: - 
aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 8: - common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 16: - common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 32: - common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 64: - common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - default: - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c deleted file mode 100644 index bed600d5b..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c +++ /dev/null @@ -1,605 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_hv_8ht_8vt_and_aver_dst_4w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; - v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - vec2 = 
(v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); - vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); - vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - SRARI_H2_SH(res0, res1, FILTER_BITS); - SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); - XORI_B2_128_UB(tmp0, tmp1); - AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out5 = hz_out9; - vec0 = vec2; - vec1 = vec3; - vec2 = vec4; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_8w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); - ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); - ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); - - for (loop_cnt = 
(height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); - tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); - out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); - tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - hz_out6 = hz_out10; - out0 = out2; - out1 = out3; - out2 = out8; - out4 = out6; - out5 = out7; - out6 = out9; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_16w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_32w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_64w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 8; multiple8_cnt--;) { - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 dst0, dst1, dst2, dst3, res0, res1; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); - - filt = LD_UH(filter_vert); - filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - - hz_out0 = 
HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - src8 = LD_SB(src); - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); - SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, - hz_out3, hz_out5, 8); - hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, - tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, - res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_and_aver_dst_4w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert); - } else if (8 == height) { - common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert); - } -} - -static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert) { - v16i8 src0, 
src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - src += (5 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec1, filt_vt); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec2, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec3, filt_vt); - - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); -} - -static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_SB(src); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hv_2ht_2vt_and_aver_dst_8w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, - 
filter_horiz, filter_vert); - } else { - common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( - src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - } -} - -static void common_hv_2ht_2vt_and_aver_dst_16w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); - dst += dst_stride; - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); - dst += dst_stride; - } -} - -static void common_hv_2ht_2vt_and_aver_dst_32w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 16; - dst += 16; - } -} - -static void common_hv_2ht_2vt_and_aver_dst_64w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 16; - dst += 16; - } -} - -void 
aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_hor[8], filt_ver[8]; - - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], h); - break; - case 8: - common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], h); - break; - case 16: - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); - break; - case 32: - common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); - break; - case 64: - common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); - break; - default: - aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { - aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - } else { - switch (w) { - case 4: - common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 8: - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 16: - common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 32: - common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 64: - common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - default: - aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c deleted file mode 100644 index dae771104..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c +++ /dev/null @@ -1,677 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, out; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; - v16i8 src10998, filt0, filt1, filt2, filt3; - v8i16 filt, out10, out32; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, - src4332, src6554); - XORI_B3_128_SB(src2110, src4332, src6554); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); - XORI_B2_128_SB(src8776, src10998); - out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, - filt1, filt2, filt3); - out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, - filt1, filt2, filt3); - SRARI_H2_SH(out10, out32, FILTER_BITS); - SAT_SH2_SH(out10, out32, 7); - out = PCKEV_XORI128_UB(out10, out32); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); - out = __msa_aver_u_b(out, dst0); - - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - - src2110 = src6554; - src4332 = src8776; - src6554 = src10998; - src6 = src10; - } -} - -static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; - v8i16 filt, out0, out1, out2, out3; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - XORI_B4_128_SB(src7, src8, src9, src10); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, - filt2, filt3); - out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, - filt2, filt3); 
- out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, - filt2, filt3); - out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src6 = src10; - } -} - -static void common_vt_8t_and_aver_dst_16w_mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height, int32_t width) { - const uint8_t *src_tmp; - uint8_t *dst_tmp; - uint32_t loop_cnt, cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; - v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; - v16i8 filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; - v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - for (cnt = (width >> 4); cnt--;) { - src_tmp = src; - dst_tmp = dst; - - LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src_tmp += (7 * src_stride); - - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, - src54_l, src21_l); - ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); - src_tmp += (4 * src_stride); - - LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); - XORI_B4_128_SB(src7, src8, src9, src10); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, - src87_l, src98_l, src109_l); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, - filt1, filt2, filt3); - out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, - filt1, filt2, filt3); - out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, - filt1, filt2, filt3); - out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); - PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, - out3_r, tmp0, tmp1, tmp2, tmp3); - XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); - AVER_UB4_UB(tmp0, dst0, 
tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1, - dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); - dst_tmp += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src10_l = src54_l; - src32_l = src76_l; - src54_l = src98_l; - src21_l = src65_l; - src43_l = src87_l; - src65_l = src109_l; - src6 = src10; - } - - src += 16; - dst += 16; - } -} - -static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 16); -} - -static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 32); -} - -static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 64); -} - -static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; - v16i8 src10_r, src32_r, src21_r, src43_r; - v8i16 filt; - v8u16 tmp0, tmp1; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - src4 = LD_SB(src); - src += src_stride; - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, dst0); - - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; - v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; - v16u8 src2110, src4332, src6554, src8776, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - src8 = LD_SB(src); - - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, - dst3); - ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, - src76_r, src87_r); - ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, - src76_r, src2110, src4332, src6554, src8776); - DOTP_UB4_UH(src2110, src4332, 
src6554, src8776, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); - AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16u8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); - ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); -} - -static void common_vt_2t_and_aver_dst_8x8mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); - src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); - - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, - vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, - dst_stride); - dst += (4 * dst_stride); - - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, - dst_stride); - dst += (4 * dst_stride); - - src0 = src8; - } -} - -static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, - filter, height); - } -} - -static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t 
*src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3, filt; - - /* rearranging filter_y */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - dst += dst_stride; - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); - dst += dst_stride; - - src0 = src4; - } -} - -static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3, filt; - - /* rearranging filter_y */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_UB2(src, 16, src0, src5); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - - LD_UB4(src + 16, src_stride, src6, src7, src8, src9); - LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); - src += (4 * src_stride); - - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); - - ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); - ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); - - ILVR_B2_UB(src8, src7, src9, 
src8, vec4, vec6); - ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); - dst += (4 * dst_stride); - - src0 = src4; - src5 = src9; - } -} - -static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5; - v16u8 src6, src7, src8, src9, src10, src11, filt0; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8u16 filt; - - /* rearranging filter_y */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_UB4(src, 16, src0, src3, src6, src9); - src += src_stride; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - LD_UB2(src, src_stride, src1, src2); - LD_UB2(dst, dst_stride, dst0, dst1); - LD_UB2(src + 16, src_stride, src4, src5); - LD_UB2(dst + 16, dst_stride, dst2, dst3); - LD_UB2(src + 32, src_stride, src7, src8); - LD_UB2(dst + 32, dst_stride, dst4, dst5); - LD_UB2(src + 48, src_stride, src10, src11); - LD_UB2(dst + 48, dst_stride, dst6, dst7); - src += (2 * src_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); - - ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); - ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); - - ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); - ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); - - ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); - ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); - dst += (2 * dst_stride); - - src0 = src2; - src3 = src5; - src6 = src8; - src9 = src11; - } -} - -void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_ver[8]; - - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - 
filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 8: - common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 16: - common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 32: - common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 64: - common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - case 8: - common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - case 16: - common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - - break; - case 32: - common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - case 64: - common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c index fc3a823c5..363fad308 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c @@ -10,7 +10,9 @@ */ #include <assert.h> -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/aom_convolve_msa.h" static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c deleted file mode 100644 index a4d594931..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c +++ /dev/null @@ -1,630 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -const uint8_t mc_filt_mask_arr[16 * 3] = { - /* 8 width cases */ - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - /* 4 width cases */ - 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, - /* 4 width cases */ - 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 -}; - -static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); - out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); - out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); - out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); - SAT_SH2_SH(tmp0, tmp1, 7); - out = PCKEV_XORI128_UB(tmp0, tmp1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out5 = hz_out9; - out0 = out2; - out1 = out3; - out2 = out4; - } -} - -static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_hz0, filt_hz1, 
filt_hz2, filt_hz3; - v16u8 mask0, mask1, mask2, mask3, vec0, vec1; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); - ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); - ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - XORI_B4_128_SB(src7, src8, src9, src10); - - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); - tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); - out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); - tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - vec0 = PCKEV_XORI128_UB(tmp0, tmp1); - vec1 = PCKEV_XORI128_UB(tmp2, tmp3); - ST8x4_UB(vec0, vec1, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out6 = hz_out10; - out0 = out2; - out1 = out3; - out2 = out8; - out4 = out6; - out5 = out7; - out6 = out9; - } -} - -static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t 
*dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 8; multiple8_cnt--;) { - common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); - - filt = LD_UH(filter_vert); - filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16i8 res0, res1, res2, res3; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); - - filt = LD_UH(filter_vert); - filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - src8 = LD_SB(src); - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); - SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, - hz_out3, 
hz_out5, 8); - hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, - vec5, vec6, vec7); - SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert); - } else if (8 == height) { - common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert); - } -} - -static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask, out0, out1; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec1, filt_vt); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec2, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec3, filt_vt); - - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, mask, out0, out1; - v16u8 filt_hz, filt_vt, vec0; - v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_SB(src); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_SB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp1 = 
__msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - LD_SB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp4 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); - PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp5 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp6 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp7 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp8 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); - PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert); - } else { - common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - } -} - -static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt_hz, filt_vt, vec0, vec1; - v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - 
hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - } -} - -static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 16; - dst += 16; - } -} - -static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 16; - dst += 16; - } -} - -void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, - int32_t y_step_q4, int32_t w, int32_t h) { - int8_t cnt, filt_hor[8], filt_ver[8]; - - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 8: - common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 16: - common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 32: - common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 64: - common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - default: - aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - break; - } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { - aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - } else { - switch (w) { - case 4: - common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, - 
(int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 8: - common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 16: - common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 32: - common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 64: - common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - default: - aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c index f7bdfc2bd..aa962b41f 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c @@ -10,7 +10,9 @@ */ #include <assert.h> -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/aom_convolve_msa.h" static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c deleted file mode 100644 index 75f8c7ea8..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/macros_msa.h" - -static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int32_t height) { - int32_t cnt; - uint32_t out0, out1, out2, out3; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - - if (0 == (height % 4)) { - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - - out0 = __msa_copy_u_w((v4i32)dst0, 0); - out1 = __msa_copy_u_w((v4i32)dst1, 0); - out2 = __msa_copy_u_w((v4i32)dst2, 0); - out3 = __msa_copy_u_w((v4i32)dst3, 0); - SW4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == (height % 2)) { - for (cnt = (height / 2); cnt--;) { - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - - LD_UB2(dst, dst_stride, dst0, dst1); - - AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); - - out0 = __msa_copy_u_w((v4i32)dst0, 0); - out1 = __msa_copy_u_w((v4i32)dst1, 0); - SW(out0, dst); - dst += dst_stride; - SW(out1, dst); - dst += dst_stride; - } - } -} - -static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int32_t height) { - int32_t cnt; - uint64_t out0, out1, out2, out3; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - - out0 = __msa_copy_u_d((v2i64)dst0, 0); - out1 = __msa_copy_u_d((v2i64)dst1, 0); - out2 = __msa_copy_u_d((v2i64)dst2, 0); - out3 = __msa_copy_u_d((v2i64)dst3, 0); - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void avg_width16_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - - for (cnt = (height / 8); cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, - dst6, dst7); - ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); - dst += (8 * dst_stride); - } -} - -static void avg_width32_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - uint8_t *dst_dup = dst; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 src8, src9, src10, src11, src12, src13, src14, src15; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; - - for (cnt = (height / 8); cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 16, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); - LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); - dst_dup += (4 * dst_stride); - LD_UB4(src, src_stride, src8, src10, src12, src14); - LD_UB4(src + 16, src_stride, src9, src11, src13, src15); 
- src += (4 * src_stride); - LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); - LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); - dst_dup += (4 * dst_stride); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, - dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, - dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, - dst13, dst14, dst15); - - ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); - ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); - dst += (4 * dst_stride); - ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); - ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); - dst += (4 * dst_stride); - } -} - -static void avg_width64_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - uint8_t *dst_dup = dst; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 src8, src9, src10, src11, src12, src13, src14, src15; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; - - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(src, 16, src4, src5, src6, src7); - src += src_stride; - LD_UB4(src, 16, src8, src9, src10, src11); - src += src_stride; - LD_UB4(src, 16, src12, src13, src14, src15); - src += src_stride; - - LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); - dst_dup += dst_stride; - LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); - dst_dup += dst_stride; - LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); - dst_dup += dst_stride; - LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); - dst_dup += dst_stride; - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, - dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, - dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, - dst13, dst14, dst15); - - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += dst_stride; - ST_UB4(dst4, dst5, dst6, dst7, dst, 16); - dst += dst_stride; - ST_UB4(dst8, dst9, dst10, dst11, dst, 16); - dst += dst_stride; - ST_UB4(dst12, dst13, dst14, dst15, dst, 16); - dst += dst_stride; - } -} - -void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - - switch (w) { - case 4: { - avg_width4_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 8: { - avg_width8_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 16: { - avg_width16_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 32: { - avg_width32_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 64: { - avg_width64_msa(src, src_stride, dst, dst_stride, h); - break; - } - default: { - int32_t lp, cnt; - for (cnt = h; cnt--;) { - for (lp = 0; lp < w; ++lp) { - dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); - } - src += src_stride; - dst += dst_stride; - } - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h index 
1a0ae4d8d..a0627c074 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h +++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h @@ -31,23 +31,6 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; tmp_dpadd_0; \ }) -#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ - filt_h1, filt_h2, filt_h3) \ - ({ \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ - vec3_m); \ - hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ - filt_h1, filt_h2, filt_h3); \ - \ - hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) - #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, mask3, filt0, filt1, filt2, filt3, \ out0, out1) \ @@ -93,32 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; res7_m, out0, out1, out2, out3); \ } -#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ - { \ - v16u8 tmp_m; \ - \ - tmp_m = PCKEV_XORI128_UB(in1, in0); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ - } - -#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ - { \ - v16u8 tmp_m; \ - \ - tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ - } - -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ - stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ - } #endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h index 31159fdcd..d51bfa899 100644 --- a/third_party/aom/aom_dsp/mips/common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/common_dspr2.h @@ -13,7 +13,9 @@ #define AOM_COMMON_MIPS_DSPR2_H_ #include <assert.h> -#include "./aom_config.h" + +#include "config/aom_config.h" + #include "aom/aom_integer.h" #ifdef __cplusplus diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c deleted file mode 100644 index d557115b9..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <stdio.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - uint32_t pos = 38; - - assert(y_step_q4 == 16); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, - w, h); - break; - case 64: - prefetch_store(dst + 32); - convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, - h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c deleted file mode 100644 index efbdcf60f..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c +++ /dev/null @@ -1,802 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3; - uint32_t tn1, tn2; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p3], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ - - /* odd 2. 
pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ - - /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - - "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=&r"(Temp4) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3, tp4; - uint32_t p1, p2, p3, p4, n1; - uint32_t st0, st1; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - "lbu %[Temp2], 0(%[dst]) \n\t" - "lbu %[tp4], 2(%[dst]) \n\t" - - /* even 2. pixel */ - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" - "sb %[Temp2], 0(%[dst]) \n\t" - "sb %[tp4], 2(%[dst]) \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "balign %[tp3], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "lbu %[Temp2], 4(%[dst]) \n\t" - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. 
pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[Temp2], 4(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tp1], 6(%[dst]) \n\t" - - /* odd 2. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - "lbu %[tp2], 1(%[dst]) \n\t" - "lbu %[tp3], 3(%[dst]) \n\t" - "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" - - /* odd 3. pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" - "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tp4], 5(%[dst]) \n\t" - - /* odd 4. pixel */ - "sb %[tp2], 1(%[dst]) \n\t" - "sb %[tp1], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbu %[tp1], 7(%[dst]) \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" - - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" - - "lbux %[p1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" - - /* store bytes */ - "sb %[tp3], 3(%[dst]) \n\t" - "sb %[tp4], 5(%[dst]) \n\t" - "sb %[tp1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), - [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... 
*/ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - uint32_t pos = 38; - - assert(x_step_q4 == 16); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 8: - convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 16: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 1); - break; - case 32: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 2); - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - default: - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c index 066308315..08bf1ab30 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c @@ -12,7 +12,8 @@ #include <assert.h> #include <stdio.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c index dc51ab1cb..2a8f75938 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c @@ -12,7 +12,8 @@ #include <assert.h> #include <stdio.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c index 3367be01a..ac87936da 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c @@ -12,7 +12,8 @@ #include <assert.h> #include <stdio.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c deleted file mode 100644 index 3574da19f..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c +++ /dev/null @@ -1,646 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 
*/ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph 
$ac1, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - if (((const int32_t *)filter_y)[0] == 0) { - aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, - h); - break; - case 64: - prefetch_store(dst + 32); - convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, - h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} - -void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - /* Fixed size intermediate buffer places limits on parameters. 
*/ - DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); - int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; - - assert(w <= 64); - assert(h <= 64); - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - - if (intermediate_height < h) intermediate_height = h; - - aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); - - aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - int x, y; - uint32_t tp1, tp2, tn1; - uint32_t tp3, tp4, tn2; - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - /* 1 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - - : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 8: - /* 2 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 16: - /* 4 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 8(%[src]) \n\t" - "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - "ulw %[tp3], 12(%[src]) \n\t" - "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 32: - /* 8 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) 
\n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 8(%[src]) \n\t" - "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - "ulw %[tp3], 12(%[src]) \n\t" - "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 16(%[src]) \n\t" - "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ - "ulw %[tp3], 20(%[src]) \n\t" - "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 24(%[src]) \n\t" - "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ - "ulw %[tp3], 28(%[src]) \n\t" - "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 28(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - /* 16 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_load(src + src_stride + 64); - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 8(%[src]) \n\t" - "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - "ulw %[tp3], 12(%[src]) \n\t" - "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 16(%[src]) \n\t" - "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ - "ulw %[tp3], 20(%[src]) \n\t" - "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 24(%[src]) \n\t" - "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ - "ulw %[tp3], 28(%[src]) \n\t" - "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 32(%[src]) \n\t" - "ulw %[tp2], 32(%[dst]) \n\t" - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 28(%[dst]) \n\t" /* store */ - "ulw %[tp3], 36(%[src]) \n\t" - "ulw %[tp4], 36(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 40(%[src]) \n\t" - "ulw %[tp2], 40(%[dst]) \n\t" - "sw %[tn1], 32(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 36(%[dst]) \n\t" /* store */ - "ulw %[tp3], 44(%[src]) \n\t" - "ulw %[tp4], 44(%[dst]) \n\t" - 
"adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 48(%[src]) \n\t" - "ulw %[tp2], 48(%[dst]) \n\t" - "sw %[tn1], 40(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 44(%[dst]) \n\t" /* store */ - "ulw %[tp3], 52(%[src]) \n\t" - "ulw %[tp4], 52(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 56(%[src]) \n\t" - "ulw %[tp2], 56(%[dst]) \n\t" - "sw %[tn1], 48(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 52(%[dst]) \n\t" /* store */ - "ulw %[tp3], 60(%[src]) \n\t" - "ulw %[tp4], 60(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 56(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - default: - for (y = h; y > 0; --y) { - for (x = 0; x < w; ++x) { - dst[x] = (dst[x] + src[x] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c deleted file mode 100644 index f6534b420..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c +++ /dev/null @@ -1,998 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4; - uint32_t n1, n2, n3, n4; - uint32_t tn1, tn2; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ - "preceu.ph.qbr %[n1], %[tp2] \n\t" - "preceu.ph.qbl %[n2], %[tp2] \n\t" - "preceu.ph.qbr %[n3], %[tn2] \n\t" - "preceu.ph.qbl %[n4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[n1], %[tn1] \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ - "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ - - /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - - "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), - [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4, n1; - uint32_t tn1, tn2, tn3; - uint32_t st0, st1; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - "lbu %[Temp2], 0(%[dst]) \n\t" - "lbu %[tn3], 2(%[dst]) \n\t" - - /* even 2. pixel */ - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "preceu.ph.qbl %[n1], %[tn2] \n\t" - "ulw %[tn1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[tn1] \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" - "sb %[Temp2], 0(%[dst]) \n\t" - "sb %[tn3], 2(%[dst]) \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "balign %[tn3], %[tn1], 3 \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "lbu %[Temp2], 4(%[dst]) \n\t" - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[Temp2], 4(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tn2] \n\t" - "preceu.ph.qbl %[p4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tp1], 6(%[dst]) \n\t" - - /* odd 2. 
pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn1] \n\t" - "preceu.ph.qbl %[n1], %[tn1] \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - "lbu %[tp2], 1(%[dst]) \n\t" - "lbu %[tn2], 3(%[dst]) \n\t" - "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" - - /* odd 3. pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "preceu.ph.qbr %[p2], %[tn3] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" - "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tn3], 5(%[dst]) \n\t" - - /* odd 4. pixel */ - "sb %[tp2], 1(%[dst]) \n\t" - "sb %[tp1], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbu %[tn1], 7(%[dst]) \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" - - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" - - "lbux %[n1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" - - /* store bytes */ - "sb %[tn2], 3(%[dst]) \n\t" - "sb %[tn3], 5(%[dst]) \n\t" - "sb %[tn1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), - [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "ulw %[qload2], 16(%[src]) \n\t" - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "ulw %[qload2], 16(%[src]) \n\t" - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... 
*/ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - if (((const int32_t *)filter_x)[0] == 0) { - aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - src -= 3; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 8: - convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 16: - convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 1); - break; - case 32: - convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 2); - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - default: - aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); - break; - } - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c index dd4bc821a..af54b4264 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c @@ -12,1389 +12,14 @@ #include <assert.h> #include <stdio.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" #if HAVE_DSPR2 -static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4; - uint32_t tn1, tn2; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - dst_ptr = dst; - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. 
pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tn2] \n\t" - "preceu.ph.qbl %[p4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn1] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[tn1], %[Temp2](%[cm]) \n\t" - "lbux %[p2], %[Temp4](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[tn1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[tp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[p2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), - [dst_stride] "r"(dst_stride)); - - /* Next row... */ - src += src_stride; - dst += 1; - } -} - -static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - uint32_t vector4a = 64; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3; - uint32_t p1, p2, p3, p4, n1; - uint8_t *odd_dst; - uint32_t dst_pitch_2 = (dst_stride << 1); - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - dst_ptr = dst; - odd_dst = (dst_ptr + dst_stride); - - __asm__ __volatile__( - "ulw %[tp2], 0(%[src]) \n\t" - "ulw %[tp1], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp1] \n\t" - "preceu.ph.qbl %[p4], %[tp1] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "preceu.ph.qbr %[p1], %[tp3] \n\t" - "preceu.ph.qbl %[n1], %[tp3] \n\t" - "ulw %[tp2], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[tp2] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" - "lbux %[tp3], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" - "extp %[p3], $ac1, 31 \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[Temp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "sb %[tp3], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - - "ulw %[tp1], 1(%[src]) \n\t" - "ulw %[tp3], 5(%[src]) \n\t" - - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[tp2], %[p3](%[cm]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "sb %[tp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "ulw %[tp2], 9(%[src]) \n\t" - - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp1], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[n1], %[tp2] \n\t" - "ulw %[Temp1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. pixel */ - "lbux %[tp3], %[Temp2](%[cm]) \n\t" - "preceu.ph.qbr %[p2], %[Temp1] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. 
pixel */ - "sb %[tp3], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[n1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[p2], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[n1], 0(%[odd_dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), - [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), - [dst_pitch_2] "r"(dst_pitch_2)); - - /* Next row... */ - src += src_stride; - dst += 1; - } -} - -static void convolve_horiz_16_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "ulw %[qload2], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. 
pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "ulw %[qload1], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 16(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. 
pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 17(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), - [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... */ - src_ptr += src_stride; - - dst_ptr += 1; - } -} - -static void convolve_horiz_64_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "ulw %[qload2], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. 
pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "ulw %[qload1], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 16(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. 
pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 17(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), - [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... */ - src_ptr += src_stride; - - dst_ptr += 1; - } -} - -void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, int w, int h) { - int x, y, k; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - int sum = 0; - - for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; - - dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - } - - src += src_stride; - dst += 1; - } -} - -void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { - int x, y; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x * dst_stride] = src[x]; - } - - src += src_stride; - dst += 1; - } -} - -void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); - int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; - uint32_t pos = 38; - - (void)x_step_q4; - - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - if (intermediate_height < h) intermediate_height = h; - - /* copy the src to dst */ - if (filter_x[3] == 0x80) { - copy_horiz_transposed(src - src_stride * 3, src_stride, temp, - intermediate_height, w, intermediate_height); - } else if (((const int32_t *)filter_x)[0] == 0) { - aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp, - intermediate_height, filter_x, w, intermediate_height); - } else { - src -= (src_stride * 3 + 3); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - - switch (w) { - case 4: - convolve_horiz_4_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - intermediate_height); - break; - case 8: - convolve_horiz_8_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - 
intermediate_height); - break; - case 16: - case 32: - convolve_horiz_16_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - intermediate_height, (w / 16)); - break; - case 64: - prefetch_load(src + 32); - convolve_horiz_64_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - intermediate_height); - break; - default: - convolve_horiz_transposed(src, src_stride, temp, intermediate_height, - filter_x, w, intermediate_height); - break; - } - } - - /* copy the src to dst */ - if (filter_y[3] == 0x80) { - copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); - } else if (((const int32_t *)filter_y)[0] == 0) { - aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, - filter_y, h, w); - } else { - switch (h) { - case 4: - convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w); - break; - case 8: - convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w); - break; - case 16: - case 32: - convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w, (h / 16)); - break; - case 64: - convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w); - break; - default: - convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, - filter_y, h, w); - break; - } - } -} - void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c index c60557617..f9c6879ab 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c @@ -12,7 +12,8 @@ #include <assert.h> #include <stdio.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c index d8a90b6ab..201e66427 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c @@ -12,7 +12,8 @@ #include <assert.h> #include <stdio.h> -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h index f8fd9e2b6..e7b8d531b 100644 --- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h @@ -14,7 +14,8 @@ #include <assert.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_dsp/mips/common_dspr2.h" @@ -29,18 +30,6 @@ void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t 
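/* Editor's illustration, not part of the aom diff: the deleted
 * aom_convolve8_dspr2 above is a two-pass scheme.  The horizontal 8-tap pass
 * writes its result transposed into `temp` (convolve_horiz_transposed stores
 * dst[x * dst_stride]), so the vertical filter can then be applied by running
 * the same horizontal routines over `temp`, transposing the data back on the
 * way out.  The special cases are literal: filter_x[3] == 0x80 (128 == 1.0 at
 * FILTER_BITS == 7) is handled as a copy, and a zero first 32-bit filter word
 * (taps 0 and 1 zero) falls back to the 2-tap aom_convolve2_dspr2 path.  A
 * minimal scalar sketch of the composition follows; the function name is
 * hypothetical, it assumes the stdint/stddef types already included in this
 * file, and it reuses the reference convolve_horiz_transposed shown in this
 * hunk. */
static void convolve8_two_pass_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x,
                                      const int16_t *filter_y, int w, int h) {
  uint8_t temp[64 * 135];  /* same w x (h + 7) transposed scratch buffer */
  const int ih = h + 7;    /* intermediate height for y_step_q4 == 16 */
  /* pass 1: horizontal filter over rows -3 .. h+3, stored transposed */
  convolve_horiz_transposed(src - 3 * src_stride - 3, src_stride, temp, ih,
                            filter_x, w, ih);
  /* pass 2: "vertical" filter == horizontal filter over the transposed temp */
  convolve_horiz_transposed(temp, ih, dst, dst_stride, filter_y, h, w);
}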
*filter_y, int y_step_q4, int w, - int h); - void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, int h); diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c deleted file mode 100644 index 43dce8ba6..000000000 --- a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c +++ /dev/null @@ -1,928 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/fwd_txfm_msa.h" - -static void fdct8x32_1d_column_load_butterfly(const int16_t *input, - int32_t src_stride, - int16_t *temp_buff) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 step0, step1, step2, step3; - v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; - v8i16 step0_1, step1_1, step2_1, step3_1; - - /* 1st and 2nd set */ - LD_SH4(input, src_stride, in0, in1, in2, in3); - LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); - LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); - LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); - SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, - step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, - step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH4(step0, step1, step2, step3, temp_buff, 8); - ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); - ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); - ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); - - /* 3rd and 4th set */ - LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); - LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); - LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); - LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); - SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, - step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, - step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); - ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); - ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); - ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); -} - -static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 temp0, temp1; - - /* fdct even */ - LD_SH4(input, 8, in0, in1, in2, in3); - 
LD_SH4(input + 96, 8, in12, in13, in14, in15); - BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, - vec3, in12, in13, in14, in15); - LD_SH4(input + 32, 8, in4, in5, in6, in7); - LD_SH4(input + 64, 8, in8, in9, in10, in11); - BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, - in8, in9, in10, in11); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); - DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp); - ST_SH(temp1, temp + 512); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 256); - ST_SH(temp1, temp + 768); - - SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 128); - ST_SH(temp1, temp + 896); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 640); - ST_SH(temp1, temp + 384); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 64); - ST_SH(temp1, temp + 960); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 576); - ST_SH(temp1, temp + 448); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 320); - ST_SH(temp1, temp + 704); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 192); - ST_SH(temp1, temp + 832); -} - -static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; - - in20 = LD_SH(input + 32); - in21 = LD_SH(input + 40); - in26 = LD_SH(input + 80); - in27 = LD_SH(input + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - in18 = LD_SH(input + 16); - in19 = LD_SH(input + 24); - in28 = LD_SH(input + 96); - in29 = LD_SH(input + 104); - - vec4 = in19 - in20; - ST_SH(vec4, input + 32); - vec4 = in18 - in21; - ST_SH(vec4, input + 40); - vec4 = in29 - in26; - ST_SH(vec4, input + 80); - vec4 = in28 - in27; - ST_SH(vec4, input + 88); - - in21 = in18 + in21; - in20 = in19 + in20; - in27 = in28 + in27; - in26 = in29 + in26; - - 
LD_SH4(input + 48, 8, in22, in23, in24, in25); - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - - in16 = LD_SH(input); - in17 = LD_SH(input + 8); - in30 = LD_SH(input + 112); - in31 = LD_SH(input + 120); - - vec4 = in17 - in22; - ST_SH(vec4, input + 16); - vec4 = in16 - in23; - ST_SH(vec4, input + 24); - vec4 = in31 - in24; - ST_SH(vec4, input + 96); - vec4 = in30 - in25; - ST_SH(vec4, input + 104); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr); - ST_SH(vec4, temp_ptr + 960); - - SUB2(in27, in26, in25, in24, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 448); - ST_SH(vec4, temp_ptr + 512); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec4, temp_ptr + 704); - ST_SH(vec5, temp_ptr + 256); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec4, temp_ptr + 192); - ST_SH(vec5, temp_ptr + 768); - - LD_SH4(input + 16, 8, in22, in23, in20, in21); - LD_SH4(input + 80, 8, in26, in27, in24, in25); - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - ADD2(in28, in29, in31, in30, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 832); - ST_SH(vec4, temp_ptr + 128); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 320); - ST_SH(vec4, temp_ptr + 640); - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 576); - ST_SH(vec4, temp_ptr + 384); - - ADD2(in29, in28, in30, in31, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 64); - ST_SH(vec4, temp_ptr + 896); -} - -static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, - int16_t *tmp_buf, int16_t *tmp_buf_big) { - fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); - fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); - 
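/* Editor's note, not part of the diff: fdct8x32_1d_column processes 8 columns
 * per call.  load_butterfly pre-scales the input by 4 (SLLI by 2), matching
 * the reference fdct32x32 column pass, and performs the first 32-point
 * butterfly; even_store and odd_store then finish the even- and odd-index
 * halves.  Each ST_SH offset in those helpers appears to be
 * (output index) * 32 int16 elements, i.e. transform output k lands in row k
 * of the 32x32 tmp_buf_big block (+ 512 is row 16, + 256 is row 8, the odd
 * half starts at + 32 for row 1), which is why the row pass later reads
 * tmp_buf_big with a stride of 32. */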
fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); -} - -static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, - int16_t *output) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 step0, step1, step2, step3, step4, step5, step6, step7; - - LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, - step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); - - /* 2nd set */ - LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, - step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, - (output + 8 * 8), 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); -} - -static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, - int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; - v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; - v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, - vec7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); - - /* Stage 3 */ - UNPCK_SH_SW(vec0, vec0_l, vec0_r); - UNPCK_SH_SW(vec1, vec1_l, vec1_r); - UNPCK_SH_SW(vec2, vec2_l, vec2_r); - UNPCK_SH_SW(vec3, vec3_l, vec3_r); - UNPCK_SH_SW(vec4, vec4_l, vec4_r); - UNPCK_SH_SW(vec5, vec5_l, vec5_r); - UNPCK_SH_SW(vec6, vec6_l, vec6_r); - UNPCK_SH_SW(vec7, vec7_l, vec7_r); - ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, - tmp1_w, tmp2_w, tmp3_w); - BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); - ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, - vec1_r, vec2_r, vec3_r); - - tmp3_w = vec0_r + vec3_r; - vec0_r = vec0_r - vec3_r; - vec3_r = vec1_r + vec2_r; - vec1_r = vec1_r - vec2_r; - - DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, 
cospi_16_64, - vec4_r, tmp3_w, vec6_r, vec3_r); - FDCT32_POSTPROC_NEG_W(vec4_r); - FDCT32_POSTPROC_NEG_W(tmp3_w); - FDCT32_POSTPROC_NEG_W(vec6_r); - FDCT32_POSTPROC_NEG_W(vec3_r); - PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); - ST_SH2(vec5, vec4, out, 8); - - DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, - vec4_r, tmp3_w, vec6_r, vec3_r); - FDCT32_POSTPROC_NEG_W(vec4_r); - FDCT32_POSTPROC_NEG_W(tmp3_w); - FDCT32_POSTPROC_NEG_W(vec6_r); - FDCT32_POSTPROC_NEG_W(vec3_r); - PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); - ST_SH2(vec5, vec4, out + 16, 8); - - LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 32); - ST_SH(in5, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 40); - ST_SH(in5, out + 48); - - LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 64); - ST_SH(in5, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 72); - ST_SH(in5, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 80); - ST_SH(in5, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 96); - ST_SH(in5, out + 88); -} - -static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, - vec7, in8, in9, in10, in11, in12, in13, in14, in15); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); - DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out); - ST_SH(temp1, out + 8); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 16); - 
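/* Editor's note, not part of the diff: fdct8x32_1d_row_even_4x above widens
 * its stage-3 terms to 32-bit vectors (UNPCK_SH_SW, DOTP_CONST_PAIR_W,
 * FDCT32_POSTPROC_NEG_W) before packing back to int16, while this
 * fdct8x32_1d_row_even stays in 16-bit halfwords throughout.  The widened
 * variant is the one aom_fdct32x32_msa (later in this hunk) uses for the
 * first 8-row block via fdct32x8_1d_row_4x, presumably to keep headroom in
 * the largest intermediate sums; the remaining blocks use this 16-bit
 * version. */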
ST_SH(temp1, out + 24); - - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 32); - ST_SH(temp1, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 40); - ST_SH(temp1, out + 48); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 64); - ST_SH(temp1, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 72); - ST_SH(temp1, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 80); - ST_SH(temp1, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 96); - ST_SH(temp1, out + 88); -} - -static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, - int16_t *out) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; - - in20 = LD_SH(temp + 32); - in21 = LD_SH(temp + 40); - in26 = LD_SH(temp + 80); - in27 = LD_SH(temp + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - in18 = LD_SH(temp + 16); - in19 = LD_SH(temp + 24); - in28 = LD_SH(temp + 96); - in29 = LD_SH(temp + 104); - - vec4 = in19 - in20; - ST_SH(vec4, interm_ptr + 32); - vec4 = in18 - in21; - ST_SH(vec4, interm_ptr + 88); - vec4 = in28 - in27; - ST_SH(vec4, interm_ptr + 56); - vec4 = in29 - in26; - ST_SH(vec4, interm_ptr + 64); - - ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); - - in22 = LD_SH(temp + 48); - in23 = LD_SH(temp + 56); - in24 = LD_SH(temp + 64); - in25 = LD_SH(temp + 72); - - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - - in16 = LD_SH(temp); - in17 = LD_SH(temp + 8); - in30 = LD_SH(temp + 112); - in31 = LD_SH(temp + 120); - - vec4 = in17 - in22; - ST_SH(vec4, interm_ptr + 40); - vec4 = in30 - in25; - ST_SH(vec4, interm_ptr + 48); - vec4 = in31 - in24; - ST_SH(vec4, interm_ptr + 72); - vec4 = in16 - in23; - ST_SH(vec4, interm_ptr + 80); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - - ADD4(in16, in19, in17, in18, in30, in29, in31, 
in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out); - ST_SH(vec4, out + 120); - - SUB2(in27, in26, in25, in24, in22, in21); - - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 112); - ST_SH(vec4, out + 8); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 16); - ST_SH(vec5, out + 104); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 24); - ST_SH(vec5, out + 96); - - in20 = LD_SH(interm_ptr + 32); - in21 = LD_SH(interm_ptr + 88); - in27 = LD_SH(interm_ptr + 56); - in26 = LD_SH(interm_ptr + 64); - - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - - in22 = LD_SH(interm_ptr + 40); - in25 = LD_SH(interm_ptr + 48); - in24 = LD_SH(interm_ptr + 72); - in23 = LD_SH(interm_ptr + 80); - - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - ADD2(in28, in29, in31, in30, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 32); - ST_SH(vec4, out + 88); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 40); - ST_SH(vec4, out + 80); - - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 72); - ST_SH(vec4, out + 48); - - ADD2(in29, in28, in30, in31, in17, in18); - - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 56); - ST_SH(vec5, out + 64); -} - -static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; - - /* 1st set */ - in0 = LD_SH(temp); - in4 = LD_SH(temp + 32); - in2 = LD_SH(temp + 64); - in6 = LD_SH(temp + 96); - in1 = LD_SH(temp + 128); - in7 = LD_SH(temp + 152); - in3 = LD_SH(temp + 192); - in5 = LD_SH(temp + 216); - - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - /* 2nd set */ - in0_1 = LD_SH(temp + 16); - in1_1 = LD_SH(temp + 232); - in2_1 = LD_SH(temp + 80); - in3_1 = LD_SH(temp + 168); - in4_1 = LD_SH(temp + 48); - in5_1 = LD_SH(temp + 176); - in6_1 = LD_SH(temp + 112); - in7_1 = LD_SH(temp + 240); - - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); - TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - in0_1, in1_1, in2_1, 
in3_1, in4_1, in5_1, in6_1, in7_1); - - /* 3rd set */ - in0 = LD_SH(temp + 8); - in1 = LD_SH(temp + 136); - in2 = LD_SH(temp + 72); - in3 = LD_SH(temp + 200); - in4 = LD_SH(temp + 40); - in5 = LD_SH(temp + 208); - in6 = LD_SH(temp + 104); - in7 = LD_SH(temp + 144); - - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, - 32); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); - - /* 4th set */ - in0_1 = LD_SH(temp + 24); - in1_1 = LD_SH(temp + 224); - in2_1 = LD_SH(temp + 88); - in3_1 = LD_SH(temp + 160); - in4_1 = LD_SH(temp + 56); - in5_1 = LD_SH(temp + 184); - in6_1 = LD_SH(temp + 120); - in7_1 = LD_SH(temp + 248); - - TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, - 32); -} - -static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { - fdct8x32_1d_row_load_butterfly(temp, temp_buf); - fdct8x32_1d_row_even(temp_buf, temp_buf); - fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); - fdct8x32_1d_row_transpose_store(temp_buf, output); -} - -static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, - int16_t *output) { - fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); - fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); - fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); - fdct8x32_1d_row_transpose_store(tmp_buf, output); -} - -void aom_fdct32x32_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); - - /* column transform */ - for (i = 0; i < 4; ++i) { - fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, - tmp_buf_big + (8 * i)); - } - - /* row transform */ - fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); - - /* row transform */ - for (i = 1; i < 4; ++i) { - fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); - } -} - -static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, - vec7, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT_POSTPROC_2V_NEG_H(vec0, vec1); - FDCT_POSTPROC_2V_NEG_H(vec2, vec3); - FDCT_POSTPROC_2V_NEG_H(vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec6, vec7); - FDCT_POSTPROC_2V_NEG_H(in8, in9); - FDCT_POSTPROC_2V_NEG_H(in10, in11); - FDCT_POSTPROC_2V_NEG_H(in12, in13); - FDCT_POSTPROC_2V_NEG_H(in14, in15); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - - temp0 = in0 + in3; - in0 = in0 - in3; - in3 = in1 + in2; - in1 = in1 - in2; - - DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); - ST_SH(temp0, out); - ST_SH(temp1, out + 8); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - ST_SH(temp0, out + 16); - ST_SH(temp1, out + 24); - - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, 
vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - ST_SH(temp0, out + 32); - ST_SH(temp1, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - ST_SH(temp0, out + 40); - ST_SH(temp1, out + 48); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - ST_SH(temp0, out + 64); - ST_SH(temp1, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - ST_SH(temp0, out + 72); - ST_SH(temp1, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - ST_SH(temp0, out + 80); - ST_SH(temp1, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - ST_SH(temp0, out + 96); - ST_SH(temp1, out + 88); -} - -static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, - int16_t *out) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31; - v8i16 vec4, vec5; - - in20 = LD_SH(temp + 32); - in21 = LD_SH(temp + 40); - in26 = LD_SH(temp + 80); - in27 = LD_SH(temp + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - FDCT_POSTPROC_2V_NEG_H(in20, in21); - FDCT_POSTPROC_2V_NEG_H(in26, in27); - - in18 = LD_SH(temp + 16); - in19 = LD_SH(temp + 24); - in28 = LD_SH(temp + 96); - in29 = LD_SH(temp + 104); - - FDCT_POSTPROC_2V_NEG_H(in18, in19); - FDCT_POSTPROC_2V_NEG_H(in28, in29); - - vec4 = in19 - in20; - ST_SH(vec4, interm_ptr + 32); - vec4 = in18 - in21; - ST_SH(vec4, interm_ptr + 88); - vec4 = in29 - in26; - ST_SH(vec4, interm_ptr + 64); - vec4 = in28 - in27; - ST_SH(vec4, interm_ptr + 56); - - ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); - - in22 = LD_SH(temp + 48); - in23 = LD_SH(temp + 56); - in24 = LD_SH(temp + 64); - in25 = LD_SH(temp + 72); - - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - FDCT_POSTPROC_2V_NEG_H(in22, in23); - FDCT_POSTPROC_2V_NEG_H(in24, in25); - - in16 = LD_SH(temp); - in17 = LD_SH(temp + 8); - in30 = LD_SH(temp + 112); - in31 = LD_SH(temp + 120); - - FDCT_POSTPROC_2V_NEG_H(in16, in17); - FDCT_POSTPROC_2V_NEG_H(in30, in31); - - vec4 = in17 - in22; - ST_SH(vec4, interm_ptr + 40); - vec4 = in30 - in25; - ST_SH(vec4, interm_ptr + 48); - vec4 = in31 - in24; - ST_SH(vec4, interm_ptr + 72); - vec4 = in16 - in23; - ST_SH(vec4, interm_ptr + 80); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - ADD4(in16, in19, in17, in18, in30, in29, 
in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - ST_SH(vec5, out); - ST_SH(vec4, out + 120); - - SUB2(in27, in26, in25, in24, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - ST_SH(vec5, out + 112); - ST_SH(vec4, out + 8); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - ST_SH(vec4, out + 16); - ST_SH(vec5, out + 104); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - ST_SH(vec4, out + 24); - ST_SH(vec5, out + 96); - - in20 = LD_SH(interm_ptr + 32); - in21 = LD_SH(interm_ptr + 88); - in27 = LD_SH(interm_ptr + 56); - in26 = LD_SH(interm_ptr + 64); - - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - - in22 = LD_SH(interm_ptr + 40); - in25 = LD_SH(interm_ptr + 48); - in24 = LD_SH(interm_ptr + 72); - in23 = LD_SH(interm_ptr + 80); - - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - in16 = in28 + in29; - in19 = in31 + in30; - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - ST_SH(vec5, out + 32); - ST_SH(vec4, out + 88); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - ST_SH(vec5, out + 40); - ST_SH(vec4, out + 80); - - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - ST_SH(vec5, out + 72); - ST_SH(vec4, out + 48); - - ADD2(in29, in28, in30, in31, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - ST_SH(vec4, out + 56); - ST_SH(vec5, out + 64); -} - -static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, - int16_t *output) { - fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); - fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); - fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); - fdct8x32_1d_row_transpose_store(tmp_buf, output); -} - -void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); - - /* column transform */ - for (i = 0; i < 4; ++i) { - fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], - &tmp_buf_big[0] + (8 * i)); - } - - /* row transform */ - for (i = 0; i < 4; ++i) { - fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], - out + (8 * i * 32)); - } -} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c deleted file mode 100644 index 7a285b7b8..000000000 --- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/fwd_txfm_msa.h" - -void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, - int32_t src_stride) { - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; - v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; - v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; - v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, - cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; - v8i16 coeff2 = { - -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 - }; - - LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, - in10, in11, in12, in13, in14, in15); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in8, in9, in10, in11, 2); - SLLI_4V(in12, in13, in14, in15, 2); - ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); - ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, - tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); - SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); - SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); - - tmp_ptr += 16; - - /* stp 1 */ - ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); - ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); - - cnst4 = __msa_splati_h(coeff, 0); - stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); - - cnst5 = __msa_splati_h(coeff, 1); - cnst5 = __msa_ilvev_h(cnst5, cnst4); - stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); - stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); - stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); - - /* stp2 */ - BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); - BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); - ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); - ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); - SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); - - cnst0 = __msa_splati_h(coeff, 4); - cnst1 = __msa_ilvev_h(cnst1, cnst0); - stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); - - BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); - ILVRL_H2_SH(in15, in8, vec1, vec0); - SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr); - - cnst0 = __msa_splati_h(coeff2, 0); - cnst0 = __msa_ilvev_h(cnst1, cnst0); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 224); - - ILVRL_H2_SH(in14, in9, vec1, vec0); - SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); - cnst1 = 
__msa_ilvev_h(cnst1, cnst0); - - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); - ST_SH(in8, tmp_ptr + 128); - - cnst1 = __msa_splati_h(coeff2, 2); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 96); - - SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); - cnst1 = __msa_ilvev_h(cnst1, cnst0); - - stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); - - cnst1 = __msa_splati_h(coeff, 3); - cnst1 = __msa_ilvev_h(cnst0, cnst1); - stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); - - /* stp4 */ - ADD2(stp34, stp25, stp33, stp22, in13, in10); - - ILVRL_H2_SH(in13, in10, vec1, vec0); - SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 64); - - cnst0 = __msa_splati_h(coeff2, 1); - cnst0 = __msa_ilvev_h(cnst1, cnst0); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 160); - - SUB2(stp34, stp25, stp33, stp22, in12, in11); - ILVRL_H2_SH(in12, in11, vec1, vec0); - SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); - cnst1 = __msa_ilvev_h(cnst1, cnst0); - - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); - ST_SH(in8, tmp_ptr + 192); - - cnst1 = __msa_splati_h(coeff2, 3); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 32); -} - -void fdct16x8_1d_row(int16_t *input, int16_t *output) { - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - - LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); - ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); - ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); - ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); - SRA_4V(in0, in1, in2, in3, 2); - SRA_4V(in4, in5, in6, in7, 2); - SRA_4V(in8, in9, in10, in11, 2); - SRA_4V(in12, in13, in14, in15, 2); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, - tmp7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, - tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, - tmp1, in1, tmp2, in2, tmp3, in3); - ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); - TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, - tmp5, in5, tmp6, in6, tmp7, in7); - ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); -} - -void aom_fdct4x4_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - v8i16 in0, in1, in2, in3; - - LD_SH4(input, src_stride, in0, in1, in2, in3); - - /* fdct4 pre-process */ - { - v8i16 vec, mask; - v16i8 zero = { 0 }; - v16i8 one = __msa_ldi_b(1); - - mask = (v8i16)__msa_sldi_b(zero, 
one, 15); - SLLI_4V(in0, in1, in2, in3, 4); - vec = __msa_ceqi_h(in0, 0); - vec = vec ^ 255; - vec = mask & vec; - in0 += vec; - } - - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); - SRA_4V(in0, in1, in2, in3, 2); - PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); - ST_SH2(in0, in2, output, 8); -} - -void aom_fdct8x8_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - - LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); -} - -void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[0] = LD_HADD(input, stride); - out[1] = 0; -} - -void aom_fdct16x16_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); - - /* column transform */ - for (i = 0; i < 2; ++i) { - fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); - } - - /* row transform */ - for (i = 0; i < 2; ++i) { - fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); - } -} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h deleted file mode 100644 index ada25dffd..000000000 --- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
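/* Editor's sketch, not part of the diff: scalar view of two details in the
 * deleted MSA code above, with hypothetical helper names and the stdint
 * types assumed to be in scope.  aom_fdct4x4_msa scales the input by 16
 * (SLLI by 4) and its "fdct4 pre-process" block builds a lane-0 mask so that
 * only the very first sample is bumped by 1 when it is non-zero, the same
 * tweak the reference fdct4x4 applies; aom_fdct8x8_1_msa stores the plain
 * sum of the 8x8 block (LD_HADD) in out[0] and zeroes out[1]. */
static void fdct4x4_preprocess_sketch(int16_t in[16]) {
  for (int i = 0; i < 16; ++i) in[i] *= 16; /* SLLI_4V(in0..in3, 4) */
  if (in[0]) in[0] += 1;                    /* masked add on lane 0 only */
}

static void fdct8x8_1_sketch(const int16_t *input, int16_t *out, int stride) {
  int32_t sum = 0;
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) sum += input[r * stride + c];
  out[0] = (int16_t)sum; /* out[0] = LD_HADD(input, stride) */
  out[1] = 0;
}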
- */ - -#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_ -#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_ - -#include "aom_dsp/mips/txfm_macros_msa.h" -#include "aom_dsp/txfm_common.h" - -#define LD_HADD(psrc, stride) \ - ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ - in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ - }) - -#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ - v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 coeff_m = { \ - cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ - }; \ - \ - BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ - ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ - cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ - vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ - cnst2_m = __msa_splati_h(coeff_m, 2); \ - cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ - vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ - vec7_m, out0, out2, out1, out3); \ - } - -#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ - { \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ - SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ - AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ - in2, in3); \ - AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ - in6, in7); \ - } - -#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ - v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ - \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ - s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ - \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ 
- \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ - \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ - \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - } - -#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v8i16 x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ - \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ - s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ - \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ - \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ - \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - } - -#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ - input7, out1, out3, out5, out7, out9, out11, out13, \ - out15) \ - { \ - v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ - v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ - v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ - v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ - v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ - v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ - cospi_10_64, cospi_22_64, 
cospi_6_64, cospi_26_64 }; \ - v8i16 coeff2_m = { \ - -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ - }; \ - \ - /* stp 1 */ \ - ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ - ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ - \ - cnst4_m = __msa_splati_h(coeff_m, 0); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ - \ - cnst5_m = __msa_splati_h(coeff_m, 1); \ - cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ - stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ - stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ - \ - /* stp2 */ \ - BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ - stp33_m); \ - BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ - stp34_m); \ - \ - ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ - ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 4); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 3); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - /* stp4 */ \ - BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ - vec5_m); \ - BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ - stp31_m); \ - \ - ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - \ - out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 0); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 2); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 1); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 3); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - } - -#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ - { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one_m = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clti_s_h(vec0, 0); \ - tp1_m = __msa_clti_s_h(vec1, 0); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = 
one_m & tp0_m; \ - tp1_m = one_m & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ - } - -#define FDCT32_POSTPROC_NEG_W(vec) \ - { \ - v4i32 temp_m; \ - v4i32 one_m = __msa_ldi_w(1); \ - \ - temp_m = __msa_clti_s_w(vec, 0); \ - vec += 1; \ - temp_m = one_m & temp_m; \ - vec += temp_m; \ - vec >>= 2; \ - } - -#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ - { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clei_s_h(vec0, 0); \ - tp1_m = __msa_clei_s_h(vec1, 0); \ - tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ - tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one & tp0_m; \ - tp1_m = one & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ - } - -#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ - const0, const1, out0, out1, out2, out3) \ - { \ - v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ - v4i32 k0_m = __msa_fill_w((int32_t)const0); \ - \ - s0_m = __msa_fill_w((int32_t)const1); \ - k0_m = __msa_ilvev_w(s0_m, k0_m); \ - \ - ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ - ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ - ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ - ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ - \ - DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ - \ - DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ - } - -void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, - int32_t src_stride); -void fdct16x8_1d_row(int16_t *input, int16_t *output); -#endif // AOM_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c deleted file mode 100644 index 0ea127f52..000000000 --- a/third_party/aom/aom_dsp/mips/idct16x16_msa.c +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { - v8i16 loc0, loc1, loc2, loc3; - v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; - v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; - v8i16 tmp5, tmp6, tmp7; - - LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - input += 8; - LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - - TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, - reg2, reg3, reg4, reg5, reg6, reg7); - TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, - reg9, reg10, reg11, reg12, reg13, reg14, reg15); - DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); - DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); - BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); - DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); - DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); - DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); - BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); - SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4, - reg8); - ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6, - reg10); - - /* stage 2 */ - DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); - DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); - - reg9 = reg1 - loc2; - reg1 = reg1 + loc2; - reg7 = reg15 - loc3; - reg15 = reg15 + loc3; - - DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); - DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); - BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); - - loc1 = reg15 + reg3; - reg3 = reg15 - reg3; - loc2 = reg2 + loc1; - reg15 = reg2 - loc1; - - loc1 = reg1 + reg13; - reg13 = reg1 - reg13; - loc0 = reg0 + loc1; - loc1 = reg0 - loc1; - tmp6 = loc0; - tmp7 = loc1; - reg0 = loc2; - - DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); - DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); - - loc0 = reg9 + reg5; - reg5 = reg9 - reg5; - reg2 = reg6 + loc0; - reg1 = reg6 - loc0; - - loc0 = reg7 + reg11; - reg11 = reg7 - reg11; - loc1 = reg4 + loc0; - loc2 = reg4 - loc0; - tmp5 = loc1; - - DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); - BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); - - reg10 = loc0; - reg11 = loc1; - - DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); - BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); - - reg13 = loc2; - - /* Transpose and store the output */ - reg12 = tmp5; - reg14 = tmp6; - reg3 = tmp7; - - /* transpose block */ - TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, - reg2, reg4, reg6, reg8, reg10, reg12, reg14); - ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); - - /* transpose block */ - TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, - reg13, reg11, reg5, reg7, reg9, reg1, reg15); - ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); -} - -void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 loc0, loc1, loc2, loc3; - v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; - v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; - v8i16 tmp5, tmp6, tmp7; - - /* load 
up 8x8 */ - LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - input += 8 * 16; - /* load bottom 8x8 */ - LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - - DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); - DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); - BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); - DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); - DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); - DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); - BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); - - reg0 = reg2 - loc1; - reg2 = reg2 + loc1; - reg12 = reg14 - loc0; - reg14 = reg14 + loc0; - reg4 = reg6 - loc3; - reg6 = reg6 + loc3; - reg8 = reg10 - loc2; - reg10 = reg10 + loc2; - - /* stage 2 */ - DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); - DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); - - reg9 = reg1 - loc2; - reg1 = reg1 + loc2; - reg7 = reg15 - loc3; - reg15 = reg15 + loc3; - - DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); - DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); - BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); - - loc1 = reg15 + reg3; - reg3 = reg15 - reg3; - loc2 = reg2 + loc1; - reg15 = reg2 - loc1; - - loc1 = reg1 + reg13; - reg13 = reg1 - reg13; - loc0 = reg0 + loc1; - loc1 = reg0 - loc1; - tmp6 = loc0; - tmp7 = loc1; - reg0 = loc2; - - DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); - DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); - - loc0 = reg9 + reg5; - reg5 = reg9 - reg5; - reg2 = reg6 + loc0; - reg1 = reg6 - loc0; - - loc0 = reg7 + reg11; - reg11 = reg7 - reg11; - loc1 = reg4 + loc0; - loc2 = reg4 - loc0; - tmp5 = loc1; - - DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); - BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); - - reg10 = loc0; - reg11 = loc1; - - DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); - BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); - reg13 = loc2; - - /* Transpose and store the output */ - reg12 = tmp5; - reg14 = tmp6; - reg3 = tmp7; - - SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); - dst += (4 * dst_stride); - SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); - dst += (4 * dst_stride); - SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); - dst += (4 * dst_stride); - SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); -} - -void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); - int16_t *out = out_arr; - - /* transform rows */ - for (i = 0; i < 2; ++i) { - /* process 16 * 8 block */ - aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); - } - - /* transform columns */ - for (i = 0; i < 2; ++i) { - /* process 8 * 16 block */ - aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - uint8_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); - int16_t *out = out_arr; - - /* process 16 * 8 
block */ - aom_idct16_1d_rows_msa(input, out); - - /* short case just considers top 4 rows as valid output */ - out += 4 * 16; - for (i = 12; i--;) { - __asm__ __volatile__( - "sw $zero, 0(%[out]) \n\t" - "sw $zero, 4(%[out]) \n\t" - "sw $zero, 8(%[out]) \n\t" - "sw $zero, 12(%[out]) \n\t" - "sw $zero, 16(%[out]) \n\t" - "sw $zero, 20(%[out]) \n\t" - "sw $zero, 24(%[out]) \n\t" - "sw $zero, 28(%[out]) \n\t" - - : - : [out] "r"(out)); - - out += 16; - } - - out = out_arr; - - /* transform columns */ - for (i = 0; i < 2; ++i) { - /* process 8 * 16 block */ - aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - uint8_t i; - int16_t out; - v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; - v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; - - out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO(out, 6); - - vec = __msa_fill_h(out); - - for (i = 4; i--;) { - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - UNPCK_UB_SH(dst0, res0, res4); - UNPCK_UB_SH(dst1, res1, res5); - UNPCK_UB_SH(dst2, res2, res6); - UNPCK_UB_SH(dst3, res3, res7); - ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); - ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); - CLIP_SH4_0_255(res0, res1, res2, res3); - CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, - tmp2, tmp3); - ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; - - /* load input data */ - LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, - l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, - l7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, - l12, l13, l14, l15); - - /* ADST in horizontal */ - AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, - l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, - r12, r13, r14, r15); - - l1 = -r8; - l3 = -r4; - l13 = -r13; - l15 = -r1; - - TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, - l6, l7); - ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); - TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, - l13, l14, l15); - ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); -} - -void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 v0, v2, v4, v6, k0, k1, k2, k3; - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 out8, out9, out10, out11, out12, out13, out14, out15; - v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; - v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; - v8i16 res0, res1, res2, res3, res4, res5, res6, res7; - v8i16 res8, res9, res10, res11, res12, res13, res14, res15; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; - v16i8 zero = { 0 }; - - r0 = LD_SH(input + 
0 * 16); - r3 = LD_SH(input + 3 * 16); - r4 = LD_SH(input + 4 * 16); - r7 = LD_SH(input + 7 * 16); - r8 = LD_SH(input + 8 * 16); - r11 = LD_SH(input + 11 * 16); - r12 = LD_SH(input + 12 * 16); - r15 = LD_SH(input + 15 * 16); - - /* stage 1 */ - k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); - k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); - k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); - k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); - MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); - k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); - k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); - k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); - k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); - MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); - BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); - k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); - k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); - k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); - MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); - - r1 = LD_SH(input + 1 * 16); - r2 = LD_SH(input + 2 * 16); - r5 = LD_SH(input + 5 * 16); - r6 = LD_SH(input + 6 * 16); - r9 = LD_SH(input + 9 * 16); - r10 = LD_SH(input + 10 * 16); - r13 = LD_SH(input + 13 * 16); - r14 = LD_SH(input + 14 * 16); - - k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); - k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); - k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); - k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); - MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); - k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); - k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); - k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); - k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); - MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); - BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); - BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); - out1 = -out1; - SRARI_H2_SH(out0, out1, 6); - dst0 = LD_UB(dst + 0 * dst_stride); - dst1 = LD_UB(dst + 15 * dst_stride); - ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); - ADD2(res0, out0, res1, out1, res0, res1); - CLIP_SH2_0_255(res0, res1); - PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); - ST8x1_UB(res0, dst); - ST8x1_UB(res1, dst + 15 * dst_stride); - - k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); - k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); - k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); - MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); - BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); - out8 = -out8; - - SRARI_H2_SH(out8, out9, 6); - dst8 = LD_UB(dst + 1 * dst_stride); - dst9 = LD_UB(dst + 14 * dst_stride); - ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); - ADD2(res8, out8, res9, out9, res8, res9); - CLIP_SH2_0_255(res8, res9); - PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); - ST8x1_UB(res8, dst + dst_stride); - ST8x1_UB(res9, dst + 14 * dst_stride); - - k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); - k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); - k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); - MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); - out4 = -out4; - SRARI_H2_SH(out4, out5, 6); - dst4 = LD_UB(dst + 3 * dst_stride); - dst5 = LD_UB(dst + 12 * dst_stride); - ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); - ADD2(res4, out4, res5, out5, res4, res5); - CLIP_SH2_0_255(res4, res5); - PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); - ST8x1_UB(res4, dst + 3 * dst_stride); - 
ST8x1_UB(res5, dst + 12 * dst_stride); - - MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); - out13 = -out13; - SRARI_H2_SH(out12, out13, 6); - dst12 = LD_UB(dst + 2 * dst_stride); - dst13 = LD_UB(dst + 13 * dst_stride); - ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); - ADD2(res12, out12, res13, out13, res12, res13); - CLIP_SH2_0_255(res12, res13); - PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); - ST8x1_UB(res12, dst + 2 * dst_stride); - ST8x1_UB(res13, dst + 13 * dst_stride); - - k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); - k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); - MADD_SHORT(out6, out7, k0, k3, out6, out7); - SRARI_H2_SH(out6, out7, 6); - dst6 = LD_UB(dst + 4 * dst_stride); - dst7 = LD_UB(dst + 11 * dst_stride); - ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); - ADD2(res6, out6, res7, out7, res6, res7); - CLIP_SH2_0_255(res6, res7); - PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); - ST8x1_UB(res6, dst + 4 * dst_stride); - ST8x1_UB(res7, dst + 11 * dst_stride); - - MADD_SHORT(out10, out11, k0, k3, out10, out11); - SRARI_H2_SH(out10, out11, 6); - dst10 = LD_UB(dst + 6 * dst_stride); - dst11 = LD_UB(dst + 9 * dst_stride); - ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); - ADD2(res10, out10, res11, out11, res10, res11); - CLIP_SH2_0_255(res10, res11); - PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); - ST8x1_UB(res10, dst + 6 * dst_stride); - ST8x1_UB(res11, dst + 9 * dst_stride); - - k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); - k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); - MADD_SHORT(h10, h11, k1, k2, out2, out3); - SRARI_H2_SH(out2, out3, 6); - dst2 = LD_UB(dst + 7 * dst_stride); - dst3 = LD_UB(dst + 8 * dst_stride); - ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); - ADD2(res2, out2, res3, out3, res2, res3); - CLIP_SH2_0_255(res2, res3); - PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); - ST8x1_UB(res2, dst + 7 * dst_stride); - ST8x1_UB(res3, dst + 8 * dst_stride); - - MADD_SHORT(out14, out15, k1, k2, out14, out15); - SRARI_H2_SH(out14, out15, 6); - dst14 = LD_UB(dst + 5 * dst_stride); - dst15 = LD_UB(dst + 10 * dst_stride); - ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); - ADD2(res14, out14, res15, out15, res14, res15); - CLIP_SH2_0_255(res14, res15); - PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); - ST8x1_UB(res14, dst + 5 * dst_stride); - ST8x1_UB(res15, dst + 10 * dst_stride); -} diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c deleted file mode 100644 index f1ca757a0..000000000 --- a/third_party/aom/aom_dsp/mips/idct32x32_msa.c +++ /dev/null @@ -1,730 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -static void idct32x8_row_transpose_store(const int16_t *input, - int16_t *tmp_buf) { - v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; - - /* 1st & 2nd 8x8 */ - LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); - LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); - ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); - ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); - - /* 3rd & 4th 8x8 */ - LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); - LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); - ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); - ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); - ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); -} - -static void idct32x8_row_even_process_store(int16_t *tmp_buf, - int16_t *tmp_eve_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; - - /* Even stage 1 */ - LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - - DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); - DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); - BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); - DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - loc1 = vec3; - loc0 = vec1; - - DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); - DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); - BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); - BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); - BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); - - /* Even stage 2 */ - LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); - DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); - - vec0 = reg0 + reg4; - reg0 = reg0 - reg4; - reg4 = reg6 + reg2; - reg6 = reg6 - reg2; - reg2 = reg1 + reg5; - reg1 = reg1 - reg5; - reg5 = reg7 + reg3; - reg7 = reg7 - reg3; - reg3 = vec0; - - vec1 = reg2; - reg2 = reg3 + reg4; - reg3 = reg3 - reg4; - reg4 = reg5 - vec1; - reg5 = reg5 + vec1; - - DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); - DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); - - vec0 = reg0 - reg6; - reg0 = reg0 + reg6; - vec1 = reg7 - reg1; - reg7 = reg7 + reg1; - - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); - DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); - - /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ - BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); - ST_SH(loc0, (tmp_eve_buf + 15 * 8)); - ST_SH(loc1, (tmp_eve_buf)); - ST_SH(loc2, (tmp_eve_buf + 14 * 8)); - ST_SH(loc3, (tmp_eve_buf + 8)); - - BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, 
loc0); - ST_SH(loc0, (tmp_eve_buf + 13 * 8)); - ST_SH(loc1, (tmp_eve_buf + 2 * 8)); - ST_SH(loc2, (tmp_eve_buf + 12 * 8)); - ST_SH(loc3, (tmp_eve_buf + 3 * 8)); - - /* Store 8 */ - BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); - ST_SH(loc0, (tmp_eve_buf + 11 * 8)); - ST_SH(loc1, (tmp_eve_buf + 4 * 8)); - ST_SH(loc2, (tmp_eve_buf + 10 * 8)); - ST_SH(loc3, (tmp_eve_buf + 5 * 8)); - - BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); - ST_SH(loc0, (tmp_eve_buf + 9 * 8)); - ST_SH(loc1, (tmp_eve_buf + 6 * 8)); - ST_SH(loc2, (tmp_eve_buf + 8 * 8)); - ST_SH(loc3, (tmp_eve_buf + 7 * 8)); -} - -static void idct32x8_row_odd_process_store(int16_t *tmp_buf, - int16_t *tmp_odd_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - - /* Odd stage 1 */ - reg0 = LD_SH(tmp_buf + 8); - reg1 = LD_SH(tmp_buf + 7 * 8); - reg2 = LD_SH(tmp_buf + 9 * 8); - reg3 = LD_SH(tmp_buf + 15 * 8); - reg4 = LD_SH(tmp_buf + 17 * 8); - reg5 = LD_SH(tmp_buf + 23 * 8); - reg6 = LD_SH(tmp_buf + 25 * 8); - reg7 = LD_SH(tmp_buf + 31 * 8); - - DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); - DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); - - vec0 = reg0 + reg3; - reg0 = reg0 - reg3; - reg3 = reg7 + reg4; - reg7 = reg7 - reg4; - reg4 = reg1 + reg2; - reg1 = reg1 - reg2; - reg2 = reg6 + reg5; - reg6 = reg6 - reg5; - reg5 = vec0; - - /* 4 Stores */ - ADD2(reg5, reg4, reg3, reg2, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); - - SUB2(reg5, reg4, reg3, reg2, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf), 8); - - /* 4 Stores */ - DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); - DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); - BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); - ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); - - DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); - ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); - - /* Odd stage 2 */ - /* 8 loads */ - reg0 = LD_SH(tmp_buf + 3 * 8); - reg1 = LD_SH(tmp_buf + 5 * 8); - reg2 = LD_SH(tmp_buf + 11 * 8); - reg3 = LD_SH(tmp_buf + 13 * 8); - reg4 = LD_SH(tmp_buf + 19 * 8); - reg5 = LD_SH(tmp_buf + 21 * 8); - reg6 = LD_SH(tmp_buf + 27 * 8); - reg7 = LD_SH(tmp_buf + 29 * 8); - - DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); - DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); - DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); - DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); - - /* 4 Stores */ - SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); - DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); - DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); - - BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); - ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); - - DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); - - /* 4 Stores */ - ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); - BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); - ST_SH(reg0, (tmp_odd_buf + 13 * 8)); - ST_SH(reg1, (tmp_odd_buf + 14 * 8)); - - DOTP_CONST_PAIR(reg3, reg2, 
-cospi_8_64, cospi_24_64, reg0, reg1); - ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); - - /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ - - /* Load 8 & Store 8 */ - LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); - LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); - - SUB2(reg0, reg4, reg1, reg5, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg2, reg6, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); - - /* Load 8 & Store 8 */ - LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); - LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); - - SUB2(reg0, reg4, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg1, reg5, reg2, reg6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); -} - -static void idct_butterfly_transpose_store(int16_t *tmp_buf, - int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, int16_t *dst) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; - - /* FINAL BUTTERFLY : Dependency on Even & Odd */ - vec0 = LD_SH(tmp_odd_buf); - vec1 = LD_SH(tmp_odd_buf + 9 * 8); - vec2 = LD_SH(tmp_odd_buf + 14 * 8); - vec3 = LD_SH(tmp_odd_buf + 6 * 8); - loc0 = LD_SH(tmp_eve_buf); - loc1 = LD_SH(tmp_eve_buf + 8 * 8); - loc2 = LD_SH(tmp_eve_buf + 4 * 8); - loc3 = LD_SH(tmp_eve_buf + 12 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); - - ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 4 * 8); - vec1 = LD_SH(tmp_odd_buf + 13 * 8); - vec2 = LD_SH(tmp_odd_buf + 10 * 8); - vec3 = LD_SH(tmp_odd_buf + 3 * 8); - loc0 = LD_SH(tmp_eve_buf + 2 * 8); - loc1 = LD_SH(tmp_eve_buf + 10 * 8); - loc2 = LD_SH(tmp_eve_buf + 6 * 8); - loc3 = LD_SH(tmp_eve_buf + 14 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); - - ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 2 * 8); - vec1 = LD_SH(tmp_odd_buf + 11 * 8); - vec2 = LD_SH(tmp_odd_buf + 12 * 8); - vec3 = LD_SH(tmp_odd_buf + 7 * 8); - loc0 = LD_SH(tmp_eve_buf + 1 * 8); - loc1 = LD_SH(tmp_eve_buf + 9 * 8); - loc2 = LD_SH(tmp_eve_buf + 5 * 8); - loc3 = LD_SH(tmp_eve_buf + 13 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); - - ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 5 * 8); - vec1 = LD_SH(tmp_odd_buf + 15 * 8); - vec2 = LD_SH(tmp_odd_buf + 8 * 8); - vec3 = LD_SH(tmp_odd_buf + 1 * 8); - loc0 = LD_SH(tmp_eve_buf + 3 * 8); - loc1 = LD_SH(tmp_eve_buf + 11 * 8); - loc2 = 
LD_SH(tmp_eve_buf + 7 * 8); - loc3 = LD_SH(tmp_eve_buf + 15 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); - - ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); - - /* Transpose : 16 vectors */ - /* 1st & 2nd 8x8 */ - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - ST_SH4(m0, n0, m1, n1, (dst + 0), 32); - ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); - - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH4(m4, n4, m5, n5, (dst + 8), 32); - ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); - - /* 3rd & 4th 8x8 */ - LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); - LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - ST_SH4(m0, n0, m1, n1, (dst + 16), 32); - ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); - - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH4(m4, n4, m5, n5, (dst + 24), 32); - ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); -} - -static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { - DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); - DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); - DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); - - idct32x8_row_transpose_store(input, &tmp_buf[0]); - idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); - idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); - idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], - output); -} - -static void idct8x32_column_even_process_store(int16_t *tmp_buf, - int16_t *tmp_eve_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; - - /* Even stage 1 */ - LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - tmp_buf += (2 * 32); - - DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); - DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); - BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); - DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - loc1 = vec3; - loc0 = vec1; - - DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); - DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); - BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); - BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); - BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); - - /* Even stage 2 */ - /* Load 8 */ - LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - - DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); - DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); - - vec0 = reg0 + reg4; - reg0 = reg0 - reg4; - reg4 = reg6 + reg2; - reg6 = reg6 - reg2; - reg2 = reg1 + reg5; - reg1 = reg1 - reg5; - reg5 = reg7 + reg3; - reg7 = reg7 - reg3; - reg3 = vec0; - - vec1 = reg2; - reg2 = reg3 + reg4; - reg3 = reg3 - reg4; - reg4 = reg5 - vec1; - reg5 = reg5 + vec1; - - DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); - 
DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); - - vec0 = reg0 - reg6; - reg0 = reg0 + reg6; - vec1 = reg7 - reg1; - reg7 = reg7 + reg1; - - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); - DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); - - /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ - /* Store 8 */ - BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, tmp_eve_buf, 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); - - BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); - - /* Store 8 */ - BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); - - BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); -} - -static void idct8x32_column_odd_process_store(int16_t *tmp_buf, - int16_t *tmp_odd_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - - /* Odd stage 1 */ - reg0 = LD_SH(tmp_buf + 32); - reg1 = LD_SH(tmp_buf + 7 * 32); - reg2 = LD_SH(tmp_buf + 9 * 32); - reg3 = LD_SH(tmp_buf + 15 * 32); - reg4 = LD_SH(tmp_buf + 17 * 32); - reg5 = LD_SH(tmp_buf + 23 * 32); - reg6 = LD_SH(tmp_buf + 25 * 32); - reg7 = LD_SH(tmp_buf + 31 * 32); - - DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); - DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); - - vec0 = reg0 + reg3; - reg0 = reg0 - reg3; - reg3 = reg7 + reg4; - reg7 = reg7 - reg4; - reg4 = reg1 + reg2; - reg1 = reg1 - reg2; - reg2 = reg6 + reg5; - reg6 = reg6 - reg5; - reg5 = vec0; - - /* 4 Stores */ - ADD2(reg5, reg4, reg3, reg2, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); - SUB2(reg5, reg4, reg3, reg2, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); - ST_SH2(vec0, vec1, tmp_odd_buf, 8); - - /* 4 Stores */ - DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); - DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); - BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); - ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); - DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); - ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); - - /* Odd stage 2 */ - /* 8 loads */ - reg0 = LD_SH(tmp_buf + 3 * 32); - reg1 = LD_SH(tmp_buf + 5 * 32); - reg2 = LD_SH(tmp_buf + 11 * 32); - reg3 = LD_SH(tmp_buf + 13 * 32); - reg4 = LD_SH(tmp_buf + 19 * 32); - reg5 = LD_SH(tmp_buf + 21 * 32); - reg6 = LD_SH(tmp_buf + 27 * 32); - reg7 = LD_SH(tmp_buf + 29 * 32); - - DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); - DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); - DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); - DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); - - /* 4 Stores */ - SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); - DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); - DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); - BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); - 
ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); - DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); - - /* 4 Stores */ - ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); - BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); - ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); - DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); - ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); - - /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ - /* Load 8 & Store 8 */ - LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); - LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); - - SUB2(reg0, reg4, reg1, reg5, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg2, reg6, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); - - /* Load 8 & Store 8 */ - LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); - LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); - - SUB2(reg0, reg4, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg1, reg5, reg2, reg6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); -} - -static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, uint8_t *dst, - int32_t dst_stride) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; - - /* FINAL BUTTERFLY : Dependency on Even & Odd */ - vec0 = LD_SH(tmp_odd_buf); - vec1 = LD_SH(tmp_odd_buf + 9 * 8); - vec2 = LD_SH(tmp_odd_buf + 14 * 8); - vec3 = LD_SH(tmp_odd_buf + 6 * 8); - loc0 = LD_SH(tmp_eve_buf); - loc1 = LD_SH(tmp_eve_buf + 8 * 8); - loc2 = LD_SH(tmp_eve_buf + 4 * 8); - loc3 = LD_SH(tmp_eve_buf + 12 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); - SRARI_H4_SH(m0, m2, m4, m6, 6); - AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); - SRARI_H4_SH(m0, m2, m4, m6, 6); - AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, - m6); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 4 * 8); - vec1 = LD_SH(tmp_odd_buf + 13 * 8); - vec2 = LD_SH(tmp_odd_buf + 10 * 8); - vec3 = LD_SH(tmp_odd_buf + 3 * 8); - loc0 = LD_SH(tmp_eve_buf + 2 * 8); - loc1 = LD_SH(tmp_eve_buf + 10 * 8); - loc2 = LD_SH(tmp_eve_buf + 6 * 8); - loc3 = LD_SH(tmp_eve_buf + 14 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); - SRARI_H4_SH(m1, m3, m5, m7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); - SRARI_H4_SH(m1, m3, m5, m7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, - m7); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 2 * 8); - vec1 = LD_SH(tmp_odd_buf + 11 * 8); - vec2 = LD_SH(tmp_odd_buf + 12 * 8); - vec3 = LD_SH(tmp_odd_buf + 7 * 8); - loc0 = 
LD_SH(tmp_eve_buf + 1 * 8); - loc1 = LD_SH(tmp_eve_buf + 9 * 8); - loc2 = LD_SH(tmp_eve_buf + 5 * 8); - loc3 = LD_SH(tmp_eve_buf + 13 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); - SRARI_H4_SH(n0, n2, n4, n6, 6); - AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); - SRARI_H4_SH(n0, n2, n4, n6, 6); - AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, - n6); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 5 * 8); - vec1 = LD_SH(tmp_odd_buf + 15 * 8); - vec2 = LD_SH(tmp_odd_buf + 8 * 8); - vec3 = LD_SH(tmp_odd_buf + 1 * 8); - loc0 = LD_SH(tmp_eve_buf + 3 * 8); - loc1 = LD_SH(tmp_eve_buf + 11 * 8); - loc2 = LD_SH(tmp_eve_buf + 7 * 8); - loc3 = LD_SH(tmp_eve_buf + 15 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); - SRARI_H4_SH(n1, n3, n5, n7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); - SRARI_H4_SH(n1, n3, n5, n7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, - n7); -} - -static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride) { - DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); - DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); - - idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); - idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); - idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, - dst_stride); -} - -void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); - int16_t *out_ptr = out_arr; - - /* transform rows */ - for (i = 0; i < 4; ++i) { - /* process 32 * 8 block */ - idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); - } - - /* transform columns */ - for (i = 0; i < 4; ++i) { - /* process 8 * 32 block */ - idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); - int16_t *out_ptr = out_arr; - - for (i = 32; i--;) { - __asm__ __volatile__( - "sw $zero, 0(%[out_ptr]) \n\t" - "sw $zero, 4(%[out_ptr]) \n\t" - "sw $zero, 8(%[out_ptr]) \n\t" - "sw $zero, 12(%[out_ptr]) \n\t" - "sw $zero, 16(%[out_ptr]) \n\t" - "sw $zero, 20(%[out_ptr]) \n\t" - "sw $zero, 24(%[out_ptr]) \n\t" - "sw $zero, 28(%[out_ptr]) \n\t" - "sw $zero, 32(%[out_ptr]) \n\t" - "sw $zero, 36(%[out_ptr]) \n\t" - "sw $zero, 40(%[out_ptr]) \n\t" - "sw $zero, 44(%[out_ptr]) \n\t" - "sw $zero, 48(%[out_ptr]) \n\t" - "sw $zero, 52(%[out_ptr]) \n\t" - "sw $zero, 56(%[out_ptr]) \n\t" - "sw $zero, 60(%[out_ptr]) \n\t" - - : - : [out_ptr] "r"(out_ptr)); - - out_ptr += 32; - } - - out_ptr = out_arr; - - /* rows: only upper-left 8x8 has non-zero coeff */ - idct32x8_1d_rows_msa(input, out_ptr); - - /* transform columns */ - for (i = 0; i < 4; ++i) { - /* process 8 * 32 block */ - idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - int16_t out; - v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; - v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; - - out = 
ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO(out, 6); - - vec = __msa_fill_h(out); - - for (i = 16; i--;) { - LD_UB2(dst, 16, dst0, dst1); - LD_UB2(dst + dst_stride, 16, dst2, dst3); - - UNPCK_UB_SH(dst0, res0, res4); - UNPCK_UB_SH(dst1, res1, res5); - UNPCK_UB_SH(dst2, res2, res6); - UNPCK_UB_SH(dst3, res3, res7); - ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); - ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); - CLIP_SH4_0_255(res0, res1, res2, res3); - CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, - tmp2, tmp3); - - ST_UB2(tmp0, tmp1, dst, 16); - dst += dst_stride; - ST_UB2(tmp2, tmp3, dst, 16); - dst += dst_stride; - } -} diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c deleted file mode 100644 index 274818baa..000000000 --- a/third_party/aom/aom_dsp/mips/idct4x4_msa.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3; - v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; - - /* load vector elements of 4x4 block */ - LD4x4_SH(input, in0, in2, in3, in1); - TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); - UNPCK_R_SH_SW(in0, in0_r); - UNPCK_R_SH_SW(in2, in2_r); - UNPCK_R_SH_SW(in3, in3_r); - UNPCK_R_SH_SW(in1, in1_r); - SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); - - in0_r += in2_r; - in3_r -= in1_r; - in4_r = (in0_r - in3_r) >> 1; - in1_r = in4_r - in1_r; - in2_r = in4_r - in2_r; - in0_r -= in1_r; - in3_r += in2_r; - - TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); - - in0_r += in1_r; - in2_r -= in3_r; - in4_r = (in0_r - in2_r) >> 1; - in3_r = in4_r - in3_r; - in1_r = in4_r - in1_r; - in0_r -= in3_r; - in2_r += in1_r; - - PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, - in2, in3); - ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); -} - -void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int16_t a1, e1; - v8i16 in1, in0 = { 0 }; - - a1 = input[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - - in0 = __msa_insert_h(in0, 0, a1); - in0 = __msa_insert_h(in0, 1, e1); - in0 = __msa_insert_h(in0, 2, e1); - in0 = __msa_insert_h(in0, 3, e1); - - in1 = in0 >> 1; - in0 -= in1; - - ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride); -} - -void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3; - - /* load vector elements of 4x4 block */ - LD4x4_SH(input, in0, in1, in2, in3); - /* rows */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* columns */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); 
- AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* rounding (add 2^3, divide by 2^4) */ - SRARI_H4_SH(in0, in1, in2, in3, 4); - ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); -} - -void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int16_t out; - v8i16 vec; - - out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO(out, 4); - vec = __msa_fill_h(out); - - ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); -} diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c deleted file mode 100644 index 981c103cd..000000000 --- a/third_party/aom/aom_dsp/mips/idct8x8_msa.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - - /* load vector elements of 8x8 block */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - - /* rows transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* 1D idct8x8 */ - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* columns transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* 1D idct8x8 */ - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* final rounding (add 2^4, divide by 2^5) and shift */ - SRARI_H4_SH(in0, in1, in2, in3, 5); - SRARI_H4_SH(in4, in5, in6, in7, 5); - /* add block and store 8x8 */ - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); - dst += (4 * dst_stride); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); -} - -void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; - v4i32 tmp0, tmp1, tmp2, tmp3; - v8i16 zero = { 0 }; - - /* load vector elements of 8x8 block */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - - /* stage1 */ - ILVL_H2_SH(in3, in0, in2, in1, s0, s1); - k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); - k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); - k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); - k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); - DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); - SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); - PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); - PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); - BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); - - /* stage2 */ - ILVR_H2_SH(in3, in1, in2, in0, s1, s0); - k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); - k1 = 
AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); - k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); - k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); - DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); - SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); - PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); - PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); - BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); - - /* stage3 */ - s0 = __msa_ilvr_h(s6, s5); - - k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); - DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); - SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS); - PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); - - /* stage4 */ - BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, - in7); - TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - /* final rounding (add 2^4, divide by 2^5) and shift */ - SRARI_H4_SH(in0, in1, in2, in3, 5); - SRARI_H4_SH(in4, in5, in6, in7, 5); - - /* add block and store 8x8 */ - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); - dst += (4 * dst_stride); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); -} - -void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int16_t out; - int32_t val; - v8i16 vec; - - out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - val = ROUND_POWER_OF_TWO(out, 5); - vec = __msa_fill_h(val); - - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); - dst += (4 * dst_stride); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); -} diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c index bcb9c9df9..9f25cc1ca 100644 --- a/third_party/aom/aom_dsp/mips/intrapred_msa.c +++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h deleted file mode 100644 index c69835173..000000000 --- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ -#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ - -#include <assert.h> - -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_dsp/mips/common_dspr2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -/* Note: this macro expects a local int32_t named out to exist, and will write - * to that variable. 
*/ -#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ - ({ \ - \ - int32_t tmp; \ - int dct_cost_rounding = DCT_CONST_ROUNDING; \ - int in = input; \ - \ - __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac1 " \ - " \n\t" \ - "mthi $zero, $ac1 " \ - " \n\t" \ - "madd $ac1, %[in], " \ - "%[cospi_16_64] \n\t" \ - "extp %[tmp], $ac1, " \ - "31 \n\t" \ - \ - /* out = dct_const_round_shift(out * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac2 " \ - " \n\t" \ - "mthi $zero, $ac2 " \ - " \n\t" \ - "madd $ac2, %[tmp], " \ - "%[cospi_16_64] \n\t" \ - "extp %[out], $ac2, " \ - "31 \n\t" \ - \ - : [tmp] "=&r"(tmp), [out] "=r"(out) \ - : [in] "r"(in), \ - [dct_cost_rounding] "r"(dct_cost_rounding), \ - [cospi_16_64] "r"(cospi_16_64)); \ - out; \ - }) - -void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); -void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output); -void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); -void iadst4_dspr2(const int16_t *input, int16_t *output); -void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); -void iadst8_dspr2(const int16_t *input, int16_t *output); -void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); -void iadst16_dspr2(const int16_t *input, int16_t *output); - -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h deleted file mode 100644 index 122667aa8..000000000 --- a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_ -#define AOM_DSP_MIPS_INV_TXFM_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" -#include "aom_dsp/mips/txfm_macros_msa.h" -#include "aom_dsp/txfm_common.h" - -#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ - v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ - cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ - v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ - cospi_24_64, -cospi_24_64, 0, 0 }; \ - \ - SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ - ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in7, in0, in4, in3); \ - \ - SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in5, in2, in6, in1); \ - BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ - out7 = -s0_m; \ - out0 = s1_m; \ - \ - SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ - \ - ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - cnst1_m = cnst0_m; \ - \ - ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ - cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ - \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - out1 = -out1; \ - out3 = -out3; \ - out5 = -out5; \ - } - -#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \ - ({ \ - v8i16 out0_m, r0_m, r1_m; \ - \ - r0_m = __msa_fill_h(c0_h); \ - r1_m = __msa_fill_h(c1_h); \ - out0_m = __msa_ilvev_h(r1_m, r0_m); \ - \ - out0_m; \ - }) - -#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ - { \ - uint8_t *dst_m = (uint8_t *)(dst); \ - v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ - v16i8 tmp0_m, tmp1_m; \ - v16i8 zero_m = { 0 }; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ - ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ - res0_m, res1_m, res2_m, res3_m); \ - ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ - res2_m, res3_m); \ - CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ - PCKEV_B2_SB(res1_m, res0_m, res3_m, 
res2_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ - } - -#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 c0_m, c1_m, c2_m, c3_m; \ - v8i16 step0_m, step1_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - step0_m = __msa_ilvr_h(in2, in0); \ - DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - \ - c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - step1_m = __msa_ilvr_h(in3, in1); \ - DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - \ - PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ - SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ - BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ - out0, out1, out2, out3); \ - } - -#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 res0_m, res1_m, c0_m, c1_m; \ - v8i16 k1_m, k2_m, k3_m, k4_m; \ - v8i16 zero_m = { 0 }; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v4i32 int0_m, int1_m, int2_m, int3_m; \ - v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ - -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ - \ - SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ - ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ - int0_m = tmp2_m + tmp1_m; \ - \ - SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ - ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int1_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int2_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - c0_m = __msa_ilvev_h(c0_m, k1_m); \ - \ - res0_m = __msa_ilvr_h((in1), (in3)); \ - tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ - int3_m = tmp2_m + tmp0_m; \ - \ - res0_m = __msa_ilvr_h((in2), (in3)); \ - c1_m = __msa_ilvev_h(k4_m, k3_m); \ - \ - tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ - res1_m = __msa_ilvr_h((in0), (in2)); \ - c1_m = __msa_ilvev_h(k1_m, zero_m); \ - \ - tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ - int3_m += tmp2_m; \ - int3_m += tmp3_m; \ - \ - SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ - PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ - } - -#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ - ({ \ - v8i16 c0_m, c1_m; \ - \ - SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ - c0_m = __msa_ilvev_h(c1_m, c0_m); \ - \ - c0_m; \ - }) - -/* multiply and add macro */ -#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ - out2, out3) \ - { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ - DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ - cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ - SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_madd, tmp0_madd, 
tmp3_madd, tmp2_madd, out0, out1); \ - DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ - cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ - SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ - } - -/* idct 8x8 macro */ -#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ - cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ - \ - k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \ - k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \ - k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \ - k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \ - AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ - SUB2(in1, in3, in7, in5, res0_m, res1_m); \ - k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \ - k1_m = __msa_splati_h(mask_m, 4); \ - \ - ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ - DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - tp4_m = in1 + in3; \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ - tp7_m = in7 + in5; \ - k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ - BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ - BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ - out1, out2, out3, out4, out5, out6, out7); \ - } - -#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ - v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ - v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ - cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ - v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ - -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ - v8i16 mask3_m = { \ - -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ - }; \ - \ - k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \ - k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \ - ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ - r1_m, r2_m, r3_m); \ - k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \ - k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \ - ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ - r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ - k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \ - k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \ - ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ - 
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ - r1_m, r2_m, r3_m); \ - k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \ - k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \ - ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ - r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ - ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ - BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ - k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \ - k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \ - ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ - r1_m, r2_m, r3_m); \ - k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ - r6_m, r7_m); \ - ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ - SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ - k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \ - k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \ - ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ - m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ - ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ - m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ - \ - out1 = -in1; \ - out3 = -in3; \ - out5 = -in5; \ - out7 = -in7; \ - } - -#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ - r12, r13, r14, r15, out0, out1, out2, out3, out4, \ - out5, out6, out7, out8, out9, out10, out11, out12, \ - out13, out14, out15) \ - { \ - v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ - v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ - v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ - v8i16 h8_m, h9_m, h10_m, h11_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m; \ - \ - /* stage 1 */ \ - k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ - MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ - MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ - k3_m = 
AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ - MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ - g11_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ - MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ - g15_m); \ - \ - /* stage 2 */ \ - k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ - k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ - MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ - h3_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ - k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ - MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ - h6_m, h7_m); \ - BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ - BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ - h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ - \ - /* stage 3 */ \ - BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ - MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ - out7); \ - MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ - out13, out15); \ - \ - /* stage 4 */ \ - k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ - MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ - MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ - MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ - MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ - } - -void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride); -void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output); -void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride); -void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); -#endif // AOM_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c deleted file mode 100644 index c63b1e857..000000000 --- a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c +++ /dev/null @@ -1,1190 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void idct16_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { - int i; - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int step1_10, step1_11, step1_12, step1_13; - int step2_0, step2_1, step2_2, step2_3; - int step2_8, step2_9, step2_10, step2_11; - int step2_12, step2_13, step2_14, step2_15; - int load1, load2, load3, load4, load5, load6, load7, load8; - int result1, result2, result3, result4; - const int const_2_power_13 = 8192; - - for (i = no_rows; i--;) { - /* prefetch row */ - prefetch_load((const uint8_t *)(input + 16)); - - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 16(%[input]) \n\t" - "lh %[load3], 8(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[step2_0], $ac1, 31 \n\t" - "extp %[step2_1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[step2_2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[step2_3], $ac1, 31 \n\t" - - "add %[step1_0], %[step2_0], %[step2_3] \n\t" - "add %[step1_1], %[step2_1], %[step2_2] \n\t" - "sub %[step1_2], %[step2_1], %[step2_2] \n\t" - "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), - [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), - [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "lh %[load5], 2(%[input]) \n\t" - "lh %[load6], 30(%[input]) \n\t" - "lh %[load7], 18(%[input]) \n\t" - "lh %[load8], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_30_64] \n\t" - "msub $ac1, %[load6], %[cospi_2_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_14_64] \n\t" - "msub $ac3, %[load8], %[cospi_18_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_18_64] \n\t" - "madd $ac1, %[load8], %[cospi_14_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_2_64] \n\t" - "madd $ac2, %[load6], %[cospi_30_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "sub %[load5], %[result1], %[result2] \n\t" - "sub %[load6], %[result4], %[result3] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi 
$zero, $ac3 \n\t" - - "madd $ac1, %[load6], %[cospi_24_64] \n\t" - "msub $ac1, %[load5], %[cospi_8_64] \n\t" - "madd $ac3, %[load5], %[cospi_24_64] \n\t" - "madd $ac3, %[load6], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[result1], %[result2] \n\t" - "add %[step2_15], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_8] "=r"(step2_8), - [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), - [step2_14] "=r"(step2_14) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 22(%[input]) \n\t" - "lh %[load3], 26(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load3], %[cospi_6_64] \n\t" - "msub $ac3, %[load4], %[cospi_26_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load1], %[cospi_10_64] \n\t" - "madd $ac1, %[load2], %[cospi_22_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load3], %[cospi_26_64] \n\t" - "madd $ac2, %[load4], %[cospi_6_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[result2], %[result1] \n\t" - "sub %[load2], %[result4], %[result3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[result1], %[result2] \n\t" - "add %[step2_12], %[result4], %[result3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load5], 4(%[input]) \n\t" - "lh %[load6], 28(%[input]) \n\t" - "lh %[load7], 20(%[input]) \n\t" - "lh %[load8], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_28_64] \n\t" - "msub $ac1, %[load6], %[cospi_4_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_12_64] \n\t" - "msub $ac3, %[load8], %[cospi_20_64] \n\t" - 
"extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_20_64] \n\t" - "madd $ac1, %[load8], %[cospi_12_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_4_64] \n\t" - "madd $ac2, %[load6], %[cospi_28_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[result4], %[result3] \n\t" - "sub %[load5], %[load5], %[result1] \n\t" - "add %[load5], %[load5], %[result2] \n\t" - - "sub %[load6], %[result1], %[result2] \n\t" - "sub %[load6], %[load6], %[result3] \n\t" - "add %[load6], %[load6], %[result4] \n\t" - - "madd $ac1, %[load5], %[cospi_16_64] \n\t" - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[result1], %[result2] \n\t" - "add %[step1_7], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "sub %[load5], %[step2_14], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - - "madd $ac0, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_14], %[step2_13] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "add %[load6], %[load6], %[step2_9] \n\t" - - "madd $ac1, %[load6], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[step2_15], %[step2_12] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "add %[load5], %[load5], %[step2_11] \n\t" - - "madd $ac2, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_15], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_11] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_10], $ac0, 31 \n\t" - "extp %[step1_13], $ac1, 31 \n\t" - "extp %[step1_11], $ac2, 31 \n\t" - "extp %[step1_12], $ac3, 31 \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), - [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), - [step1_13] "=r"(step1_13) - : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), - [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), - [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), - [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), - [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "add %[load5], %[step1_0], %[step1_7] \n\t" - "add %[load5], %[load5], %[step2_12] \n\t" - "add %[load5], %[load5], %[step2_15] \n\t" - "add %[load6], %[step1_1], %[step1_6] \n\t" - "add %[load6], %[load6], %[step2_13] \n\t" - "add %[load6], %[load6], %[step2_14] 
\n\t" - "sh %[load5], 0(%[output]) \n\t" - "sh %[load6], 32(%[output]) \n\t" - "sub %[load5], %[step1_1], %[step1_6] \n\t" - "add %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - "sub %[load6], %[step1_0], %[step1_7] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - "add %[load6], %[load6], %[step2_11] \n\t" - "sh %[load5], 192(%[output]) \n\t" - "sh %[load6], 224(%[output]) \n\t" - "sub %[load5], %[step1_0], %[step1_7] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "sub %[load5], %[load5], %[step2_11] \n\t" - "sub %[load6], %[step1_1], %[step1_6] \n\t" - "sub %[load6], %[load6], %[step2_9] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "sh %[load5], 256(%[output]) \n\t" - "sh %[load6], 288(%[output]) \n\t" - "add %[load5], %[step1_1], %[step1_6] \n\t" - "sub %[load5], %[load5], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_14] \n\t" - "add %[load6], %[step1_0], %[step1_7] \n\t" - "sub %[load6], %[load6], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_15] \n\t" - "sh %[load5], 448(%[output]) \n\t" - "sh %[load6], 480(%[output]) \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6) - : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), - [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), - [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), - [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), - [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), - [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); - - __asm__ __volatile__( - "add %[load5], %[step1_2], %[step1_5] \n\t" - "add %[load5], %[load5], %[step1_13] \n\t" - "add %[load6], %[step1_3], %[step1_4] \n\t" - "add %[load6], %[load6], %[step1_12] \n\t" - "sh %[load5], 64(%[output]) \n\t" - "sh %[load6], 96(%[output]) \n\t" - "sub %[load5], %[step1_3], %[step1_4] \n\t" - "add %[load5], %[load5], %[step1_11] \n\t" - "sub %[load6], %[step1_2], %[step1_5] \n\t" - "add %[load6], %[load6], %[step1_10] \n\t" - "sh %[load5], 128(%[output]) \n\t" - "sh %[load6], 160(%[output]) \n\t" - "sub %[load5], %[step1_2], %[step1_5] \n\t" - "sub %[load5], %[load5], %[step1_10] \n\t" - "sub %[load6], %[step1_3], %[step1_4] \n\t" - "sub %[load6], %[load6], %[step1_11] \n\t" - "sh %[load5], 320(%[output]) \n\t" - "sh %[load6], 352(%[output]) \n\t" - "add %[load5], %[step1_3], %[step1_4] \n\t" - "sub %[load5], %[load5], %[step1_12] \n\t" - "add %[load6], %[step1_2], %[step1_5] \n\t" - "sub %[load6], %[load6], %[step1_13] \n\t" - "sh %[load5], 384(%[output]) \n\t" - "sh %[load6], 416(%[output]) \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6) - : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), - [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), - [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); - - input += 16; - output += 1; - } -} - -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { - int i; - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int step1_8, step1_9, step1_10, step1_11; - int step1_12, step1_13, step1_14, step1_15; - int step2_0, step2_1, step2_2, step2_3; - int step2_8, step2_9, step2_10, step2_11; - int step2_12, step2_13, step2_14, step2_15; - int load1, load2, load3, load4, load5, load6, load7, load8; - int result1, result2, result3, result4; - const int const_2_power_13 = 8192; - uint8_t *dest_pix; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 
32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 16; ++i) { - dest_pix = (dest + i); - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 16(%[input]) \n\t" - "lh %[load3], 8(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[step2_0], $ac1, 31 \n\t" - "extp %[step2_1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[step2_2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[step2_3], $ac1, 31 \n\t" - - "add %[step1_0], %[step2_0], %[step2_3] \n\t" - "add %[step1_1], %[step2_1], %[step2_2] \n\t" - "sub %[step1_2], %[step2_1], %[step2_2] \n\t" - "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), - [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), - [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "lh %[load5], 2(%[input]) \n\t" - "lh %[load6], 30(%[input]) \n\t" - "lh %[load7], 18(%[input]) \n\t" - "lh %[load8], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_30_64] \n\t" - "msub $ac1, %[load6], %[cospi_2_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_14_64] \n\t" - "msub $ac3, %[load8], %[cospi_18_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_18_64] \n\t" - "madd $ac1, %[load8], %[cospi_14_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_2_64] \n\t" - "madd $ac2, %[load6], %[cospi_30_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "sub %[load5], %[result1], %[result2] \n\t" - "sub %[load6], %[result4], %[result3] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load6], %[cospi_24_64] \n\t" - "msub $ac1, %[load5], %[cospi_8_64] \n\t" - "madd $ac3, %[load5], %[cospi_24_64] \n\t" - "madd $ac3, %[load6], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[result1], %[result2] \n\t" - "add %[step2_15], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] 
"=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_8] "=r"(step2_8), - [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), - [step2_14] "=r"(step2_14) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 22(%[input]) \n\t" - "lh %[load3], 26(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load3], %[cospi_6_64] \n\t" - "msub $ac3, %[load4], %[cospi_26_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load1], %[cospi_10_64] \n\t" - "madd $ac1, %[load2], %[cospi_22_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load3], %[cospi_26_64] \n\t" - "madd $ac2, %[load4], %[cospi_6_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[result2], %[result1] \n\t" - "sub %[load2], %[result4], %[result3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[result1], %[result2] \n\t" - "add %[step2_12], %[result4], %[result3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load5], 4(%[input]) \n\t" - "lh %[load6], 28(%[input]) \n\t" - "lh %[load7], 20(%[input]) \n\t" - "lh %[load8], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_28_64] \n\t" - "msub $ac1, %[load6], %[cospi_4_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_12_64] \n\t" - "msub $ac3, %[load8], %[cospi_20_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_20_64] \n\t" - "madd $ac1, %[load8], %[cospi_12_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_4_64] \n\t" - "madd $ac2, %[load6], %[cospi_28_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], 
$ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[result4], %[result3] \n\t" - "sub %[load5], %[load5], %[result1] \n\t" - "add %[load5], %[load5], %[result2] \n\t" - - "sub %[load6], %[result1], %[result2] \n\t" - "sub %[load6], %[load6], %[result3] \n\t" - "add %[load6], %[load6], %[result4] \n\t" - - "madd $ac1, %[load5], %[cospi_16_64] \n\t" - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - - "add %[step1_4], %[result1], %[result2] \n\t" - "add %[step1_7], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "sub %[load5], %[step2_14], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - - "madd $ac0, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_14], %[step2_13] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "add %[load6], %[load6], %[step2_9] \n\t" - - "madd $ac1, %[load6], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[step2_15], %[step2_12] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "add %[load5], %[load5], %[step2_11] \n\t" - - "madd $ac2, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_15], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_11] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_10], $ac0, 31 \n\t" - "extp %[step1_13], $ac1, 31 \n\t" - "extp %[step1_11], $ac2, 31 \n\t" - "extp %[step1_12], $ac3, 31 \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), - [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), - [step1_13] "=r"(step1_13) - : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), - [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), - [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), - [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), - [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); - - step1_8 = step2_8 + step2_11; - step1_9 = step2_9 + step2_10; - step1_14 = step2_13 + step2_14; - step1_15 = step2_12 + step2_15; - - __asm__ __volatile__( - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_0], %[step1_7] \n\t" - "add %[load5], %[load5], %[step1_15] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_1], %[step1_6] \n\t" - "add %[load6], %[load6], %[step1_14] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add 
%[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_2], %[step1_5] \n\t" - "add %[load5], %[load5], %[step1_13] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_3], %[step1_4] \n\t" - "add %[load6], %[load6], %[step1_12] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_3], %[step1_4] \n\t" - "add %[load5], %[load5], %[step1_11] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_2], %[step1_5] \n\t" - "add %[load6], %[load6], %[step1_10] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "sub %[load5], %[step1_1], %[step1_6] \n\t" - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[load5], %[step1_9] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_0], %[step1_7] \n\t" - "add %[load6], %[load6], %[step1_8] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_0], %[step1_7] \n\t" - "sub %[load5], %[load5], %[step1_8] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_1], %[step1_6] \n\t" - "sub %[load6], %[load6], %[step1_9] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_2], %[step1_5] \n\t" - "sub %[load5], %[load5], %[step1_10] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_3], %[step1_4] \n\t" - "sub %[load6], %[load6], %[step1_11] \n\t" - "sb 
%[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_3], %[step1_4] \n\t" - "sub %[load5], %[load5], %[step1_12] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_2], %[step1_5] \n\t" - "sub %[load6], %[load6], %[step1_13] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_1], %[step1_6] \n\t" - "sub %[load5], %[load5], %[step1_14] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_0], %[step1_7] \n\t" - "sub %[load6], %[load6], %[step1_15] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) - : - [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), - [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), - [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), - [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), - [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); - - input += 16; - } -} - -void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows - idct16_rows_dspr2(input, out, 16); - - // Then transform columns and add to dest - idct16_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); - int16_t *outptr = out; - uint32_t i; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. 
- idct16_rows_dspr2(input, outptr, 4); - - outptr += 4; - for (i = 0; i < 6; ++i) { - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 64(%[outptr]) \n\t" - "sw $zero, 96(%[outptr]) \n\t" - "sw $zero, 128(%[outptr]) \n\t" - "sw $zero, 160(%[outptr]) \n\t" - "sw $zero, 192(%[outptr]) \n\t" - "sw $zero, 224(%[outptr]) \n\t" - "sw $zero, 256(%[outptr]) \n\t" - "sw $zero, 288(%[outptr]) \n\t" - "sw $zero, 320(%[outptr]) \n\t" - "sw $zero, 352(%[outptr]) \n\t" - "sw $zero, 384(%[outptr]) \n\t" - "sw $zero, 416(%[outptr]) \n\t" - "sw $zero, 448(%[outptr]) \n\t" - "sw $zero, 480(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - - outptr += 2; - } - - // Then transform columns - idct16_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - uint32_t pos = 45; - int32_t out; - int32_t r; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__( - "addi %[out], %[out], 32 \n\t" - "sra %[a1], %[out], 6 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 16; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 16; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } -} - -void iadst16_dspr2(const int16_t *input, int16_t 
*output) { - int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - - int x0 = input[15]; - int x1 = input[0]; - int x2 = input[13]; - int x3 = input[2]; - int x4 = input[11]; - int x5 = input[4]; - int x6 = input[9]; - int x7 = input[6]; - int x8 = input[7]; - int x9 = input[8]; - int x10 = input[5]; - int x11 = input[10]; - int x12 = input[3]; - int x13 = input[12]; - int x14 = input[1]; - int x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] = output[13] = output[14] = output[15] = 0; - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = dct_const_round_shift(s0 + s8); - x1 = dct_const_round_shift(s1 + s9); - x2 = dct_const_round_shift(s2 + s10); - x3 = dct_const_round_shift(s3 + s11); - x4 = dct_const_round_shift(s4 + s12); - x5 = dct_const_round_shift(s5 + s13); - x6 = dct_const_round_shift(s6 + s14); - x7 = dct_const_round_shift(s7 + s15); - x8 = dct_const_round_shift(s0 - s8); - x9 = dct_const_round_shift(s1 - s9); - x10 = dct_const_round_shift(s2 - s10); - x11 = dct_const_round_shift(s3 - s11); - x12 = dct_const_round_shift(s4 - s12); - x13 = dct_const_round_shift(s5 - s13); - x14 = dct_const_round_shift(s6 - s14); - x15 = dct_const_round_shift(s7 - s15); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = s0 + s4; - x1 = s1 + s5; - x2 = s2 + s6; - x3 = s3 + s7; - x4 = s0 - s4; - x5 = s1 - s5; - x6 = s2 - s6; - x7 = s3 - s7; - x8 = dct_const_round_shift(s8 + s12); - x9 = dct_const_round_shift(s9 + s13); - x10 = dct_const_round_shift(s10 + s14); - x11 = dct_const_round_shift(s11 + s15); - x12 = dct_const_round_shift(s8 - s12); - x13 = dct_const_round_shift(s9 - s13); - x14 = dct_const_round_shift(s10 - s14); - x15 = dct_const_round_shift(s11 - s15); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = s0 + s2; - x1 = s1 + 
s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = dct_const_round_shift(s4 + s6); - x5 = dct_const_round_shift(s5 + s7); - x6 = dct_const_round_shift(s4 - s6); - x7 = dct_const_round_shift(s5 - s7); - x8 = s8 + s10; - x9 = s9 + s11; - x10 = s8 - s10; - x11 = s9 - s11; - x12 = dct_const_round_shift(s12 + s14); - x13 = dct_const_round_shift(s13 + s15); - x14 = dct_const_round_shift(s12 - s14); - x15 = dct_const_round_shift(s13 - s15); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = dct_const_round_shift(s2); - x3 = dct_const_round_shift(s3); - x6 = dct_const_round_shift(s6); - x7 = dct_const_round_shift(s7); - x10 = dct_const_round_shift(s10); - x11 = dct_const_round_shift(s11); - x14 = dct_const_round_shift(s14); - x15 = dct_const_round_shift(s15); - - output[0] = x0; - output[1] = -x8; - output[2] = x12; - output[3] = -x4; - output[4] = x6; - output[5] = x14; - output[6] = x10; - output[7] = x2; - output[8] = x3; - output[9] = x11; - output[10] = x15; - output[11] = x7; - output[12] = x5; - output[13] = -x13; - output[14] = x9; - output[15] = -x1; -} - -#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c deleted file mode 100644 index d469d1ad0..000000000 --- a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c +++ /dev/null @@ -1,1042 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "./aom_config.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; - int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; - int16_t step1_27, step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; - int16_t step3_28, step3_29, step3_30, step3_31; - int temp0, temp1, temp2, temp3; - int load1, load2, load3, load4; - int result1, result2; - int i, temp21; - uint8_t *dest_pix, *dest_pix1; - const int const_2_power_13 = 8192; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 32; ++i) { - dest_pix = dest + i; - dest_pix1 = dest + i + 31 * dest_stride; - - __asm__ __volatile__( - "lh %[load1], 2(%[input]) \n\t" - "lh %[load2], 62(%[input]) \n\t" - "lh %[load3], 34(%[input]) \n\t" - "lh %[load4], 30(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_31_64] \n\t" - "msub $ac1, %[load2], %[cospi_1_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_1_64] \n\t" - "madd $ac3, %[load2], %[cospi_31_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_15_64] \n\t" - "msub $ac2, %[load4], %[cospi_17_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_17_64] \n\t" - "madd $ac1, %[load4], %[cospi_15_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_17], $ac1, 31 \n\t" - "extp %[step1_30], $ac3, 31 \n\t" - "add %[step1_16], %[temp0], %[temp1] \n\t" - "add %[step1_31], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] 
"=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), - [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 18(%[input]) \n\t" - "lh %[load2], 46(%[input]) \n\t" - "lh %[load3], 50(%[input]) \n\t" - "lh %[load4], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_23_64] \n\t" - "msub $ac1, %[load2], %[cospi_9_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_9_64] \n\t" - "madd $ac3, %[load2], %[cospi_23_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_7_64] \n\t" - "msub $ac2, %[load4], %[cospi_25_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_25_64] \n\t" - "madd $ac1, %[load4], %[cospi_7_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "msub $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_18], $ac1, 31 \n\t" - "extp %[step1_29], $ac3, 31 \n\t" - "add %[step1_19], %[temp0], %[temp1] \n\t" - "add %[step1_28], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), - [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 54(%[input]) \n\t" - "lh %[load3], 42(%[input]) \n\t" - "lh %[load4], 22(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_27_64] \n\t" - "msub $ac1, %[load2], %[cospi_5_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_5_64] \n\t" - "madd $ac3, %[load2], %[cospi_27_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_11_64] \n\t" - "msub $ac2, %[load4], %[cospi_21_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_21_64] \n\t" - "madd $ac1, %[load4], %[cospi_11_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], 
%[temp2] \n\t" - - "madd $ac1, %[load2], %[cospi_12_64] \n\t" - "msub $ac1, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load1], %[cospi_12_64] \n\t" - "madd $ac3, %[load2], %[cospi_20_64] \n\t" - - "extp %[step1_21], $ac1, 31 \n\t" - "extp %[step1_26], $ac3, 31 \n\t" - "add %[step1_20], %[temp0], %[temp1] \n\t" - "add %[step1_27], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), - [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 26(%[input]) \n\t" - "lh %[load2], 38(%[input]) \n\t" - "lh %[load3], 58(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_19_64] \n\t" - "msub $ac1, %[load2], %[cospi_13_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_13_64] \n\t" - "madd $ac3, %[load2], %[cospi_19_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_3_64] \n\t" - "msub $ac2, %[load4], %[cospi_29_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_29_64] \n\t" - "madd $ac1, %[load4], %[cospi_3_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_12_64] \n\t" - "msub $ac1, %[load2], %[cospi_20_64] \n\t" - "msub $ac3, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load2], %[cospi_12_64] \n\t" - "extp %[step1_22], $ac1, 31 \n\t" - "extp %[step1_25], $ac3, 31 \n\t" - "add %[step1_23], %[temp0], %[temp1] \n\t" - "add %[step1_24], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), - [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 4(%[input]) \n\t" - "lh %[load2], 60(%[input]) \n\t" - "lh %[load3], 36(%[input]) \n\t" - "lh %[load4], 28(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_30_64] \n\t" - "msub $ac1, %[load2], %[cospi_2_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_2_64] \n\t" - "madd $ac3, %[load2], %[cospi_30_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 
\n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_14_64] \n\t" - "msub $ac2, %[load4], %[cospi_18_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_18_64] \n\t" - "madd $ac1, %[load4], %[cospi_14_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - "msub $ac1, %[load1], %[cospi_8_64] \n\t" - "madd $ac1, %[load2], %[cospi_24_64] \n\t" - "madd $ac3, %[load1], %[cospi_24_64] \n\t" - "madd $ac3, %[load2], %[cospi_8_64] \n\t" - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[temp0], %[temp1] \n\t" - "add %[step2_15], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "lh %[load1], 20(%[input]) \n\t" - "lh %[load2], 44(%[input]) \n\t" - "lh %[load3], 52(%[input]) \n\t" - "lh %[load4], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_10_64] \n\t" - "madd $ac3, %[load2], %[cospi_22_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_6_64] \n\t" - "msub $ac2, %[load4], %[cospi_26_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_26_64] \n\t" - "madd $ac1, %[load4], %[cospi_6_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[temp0], %[temp1] \n\t" - "add %[step2_12], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "mtlo 
%[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "sub %[temp0], %[step2_14], %[step2_13] \n\t" - "sub %[temp0], %[temp0], %[step2_9] \n\t" - "add %[temp0], %[temp0], %[step2_10] \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sub %[temp1], %[step2_14], %[step2_13] \n\t" - "add %[temp1], %[temp1], %[step2_9] \n\t" - "sub %[temp1], %[temp1], %[step2_10] \n\t" - "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "sub %[temp0], %[step2_15], %[step2_12] \n\t" - "sub %[temp0], %[temp0], %[step2_8] \n\t" - "add %[temp0], %[temp0], %[step2_11] \n\t" - "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sub %[temp1], %[step2_15], %[step2_12] \n\t" - "add %[temp1], %[temp1], %[step2_8] \n\t" - "sub %[temp1], %[temp1], %[step2_11] \n\t" - "madd $ac3, %[temp1], %[cospi_16_64] \n\t" - - "add %[step3_8], %[step2_8], %[step2_11] \n\t" - "add %[step3_9], %[step2_9], %[step2_10] \n\t" - "add %[step3_14], %[step2_13], %[step2_14] \n\t" - "add %[step3_15], %[step2_12], %[step2_15] \n\t" - "extp %[step3_10], $ac0, 31 \n\t" - "extp %[step3_13], $ac1, 31 \n\t" - "extp %[step3_11], $ac2, 31 \n\t" - "extp %[step3_12], $ac3, 31 \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) - : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), - [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), - [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), - [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), - [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : 
[step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; - - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 32(%[input]) \n\t" - "lh %[load3], 16(%[input]) \n\t" - "lh %[load4], 48(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[temp2], $ac3, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[temp3], $ac1, 31 \n\t" - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "lh %[load1], 8(%[input]) \n\t" - "lh %[load2], 56(%[input]) \n\t" - "lh %[load3], 40(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], 
%[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_12_64] \n\t" - "msub $ac2, %[load4], %[cospi_20_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_20_64] \n\t" - "madd $ac1, %[load4], %[cospi_12_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load1], %[load1], %[temp0] \n\t" - "add %[load1], %[load1], %[temp1] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - "sub %[load2], %[load2], %[temp2] \n\t" - "add %[load2], %[load2], %[temp3] \n\t" - "madd $ac1, %[load1], %[cospi_16_64] \n\t" - "madd $ac3, %[load2], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[temp0], %[temp1] \n\t" - "add %[step1_7], %[temp3], %[temp2] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - // stage 7 - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + 
DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_0], %[step2_31] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_1], %[step2_30] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_2], %[step2_29] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_3], %[step2_28] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), - [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), - [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), - [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), - [step2_31] "r"(step2_31)); - - step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - 
"add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_4], %[step1_27] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_5], %[step1_26] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_6], %[step1_25] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_7], %[step1_24] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), - [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), - [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), - [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), - [step1_27] "r"(step1_27)); - - step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - 
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_8], %[step1_23] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_9], %[step1_22] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_10], %[step1_21] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_11], %[step1_20] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), - [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), - [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), - [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), - [step1_23] "r"(step1_23)); - - step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_12], %[step2_19] \n\t" - 
"addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_13], %[step2_18] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_14], %[step2_17] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_15], %[step2_16] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), - [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), - [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); - - step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - input += 32; - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c deleted file mode 100644 index fa7703217..000000000 --- a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c +++ /dev/null @@ -1,1030 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "./aom_config.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -static void idct32_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; - int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; - int16_t step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; - int16_t step3_29, step3_30, step3_31; - int temp0, temp1, temp2, temp3; - int load1, load2, load3, load4; - int result1, result2; - int temp21; - int i; - const int const_2_power_13 = 8192; - const int32_t *input_int; - - for (i = no_rows; i--;) { - input_int = (const int32_t *)input; - - if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | - input_int[4] | input_int[5] | input_int[6] | input_int[7] | - input_int[8] | input_int[9] | input_int[10] | input_int[11] | - input_int[12] | input_int[13] | input_int[14] | input_int[15])) { - input += 32; - - __asm__ __volatile__( - "sh $zero, 0(%[output]) \n\t" - "sh $zero, 64(%[output]) \n\t" - "sh $zero, 128(%[output]) \n\t" - "sh $zero, 192(%[output]) \n\t" - "sh $zero, 256(%[output]) \n\t" - "sh $zero, 320(%[output]) \n\t" - "sh $zero, 384(%[output]) \n\t" - "sh $zero, 448(%[output]) \n\t" - "sh $zero, 512(%[output]) \n\t" - "sh $zero, 576(%[output]) \n\t" - "sh $zero, 640(%[output]) \n\t" - "sh $zero, 704(%[output]) \n\t" - "sh $zero, 768(%[output]) \n\t" - "sh $zero, 832(%[output]) \n\t" - "sh $zero, 896(%[output]) \n\t" - "sh $zero, 960(%[output]) \n\t" - "sh $zero, 1024(%[output]) \n\t" - "sh $zero, 1088(%[output]) \n\t" - "sh $zero, 1152(%[output]) \n\t" - "sh $zero, 1216(%[output]) \n\t" - "sh $zero, 1280(%[output]) \n\t" - "sh $zero, 1344(%[output]) \n\t" - "sh $zero, 1408(%[output]) \n\t" - "sh $zero, 1472(%[output]) \n\t" - "sh $zero, 1536(%[output]) \n\t" - "sh $zero, 1600(%[output]) \n\t" - "sh $zero, 1664(%[output]) \n\t" - "sh $zero, 1728(%[output]) \n\t" - "sh $zero, 1792(%[output]) \n\t" - "sh $zero, 1856(%[output]) \n\t" - "sh $zero, 1920(%[output]) \n\t" - "sh $zero, 1984(%[output]) \n\t" - - : - : [output] "r"(output)); - - output += 1; - - continue; - } - - /* prefetch row */ - prefetch_load((const uint8_t *)(input + 32)); - prefetch_load((const uint8_t *)(input + 48)); - - __asm__ __volatile__( - "lh %[load1], 2(%[input]) \n\t" - "lh %[load2], 62(%[input]) \n\t" - "lh %[load3], 34(%[input]) 
\n\t" - "lh %[load4], 30(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_31_64] \n\t" - "msub $ac1, %[load2], %[cospi_1_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_1_64] \n\t" - "madd $ac3, %[load2], %[cospi_31_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_15_64] \n\t" - "msub $ac2, %[load4], %[cospi_17_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_17_64] \n\t" - "madd $ac1, %[load4], %[cospi_15_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_17], $ac1, 31 \n\t" - "extp %[step1_30], $ac3, 31 \n\t" - "add %[step1_16], %[temp0], %[temp1] \n\t" - "add %[step1_31], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), - [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 18(%[input]) \n\t" - "lh %[load2], 46(%[input]) \n\t" - "lh %[load3], 50(%[input]) \n\t" - "lh %[load4], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_23_64] \n\t" - "msub $ac1, %[load2], %[cospi_9_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_9_64] \n\t" - "madd $ac3, %[load2], %[cospi_23_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_7_64] \n\t" - "msub $ac2, %[load4], %[cospi_25_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_25_64] \n\t" - "madd $ac1, %[load4], %[cospi_7_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "msub $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_18], $ac1, 31 \n\t" - "extp %[step1_29], $ac3, 31 \n\t" - "add %[step1_19], %[temp0], %[temp1] \n\t" - "add %[step1_28], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] 
"=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), - [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 54(%[input]) \n\t" - "lh %[load3], 42(%[input]) \n\t" - "lh %[load4], 22(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_27_64] \n\t" - "msub $ac1, %[load2], %[cospi_5_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_5_64] \n\t" - "madd $ac3, %[load2], %[cospi_27_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_11_64] \n\t" - "msub $ac2, %[load4], %[cospi_21_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_21_64] \n\t" - "madd $ac1, %[load4], %[cospi_11_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "madd $ac1, %[load2], %[cospi_12_64] \n\t" - "msub $ac1, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load1], %[cospi_12_64] \n\t" - "madd $ac3, %[load2], %[cospi_20_64] \n\t" - - "extp %[step1_21], $ac1, 31 \n\t" - "extp %[step1_26], $ac3, 31 \n\t" - "add %[step1_20], %[temp0], %[temp1] \n\t" - "add %[step1_27], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), - [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 26(%[input]) \n\t" - "lh %[load2], 38(%[input]) \n\t" - "lh %[load3], 58(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_19_64] \n\t" - "msub $ac1, %[load2], %[cospi_13_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_13_64] \n\t" - "madd $ac3, %[load2], %[cospi_19_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_3_64] \n\t" - "msub $ac2, %[load4], %[cospi_29_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_29_64] \n\t" - "madd $ac1, %[load4], %[cospi_3_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - 
- "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_12_64] \n\t" - "msub $ac1, %[load2], %[cospi_20_64] \n\t" - "msub $ac3, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load2], %[cospi_12_64] \n\t" - - "extp %[step1_22], $ac1, 31 \n\t" - "extp %[step1_25], $ac3, 31 \n\t" - "add %[step1_23], %[temp0], %[temp1] \n\t" - "add %[step1_24], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), - [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 4(%[input]) \n\t" - "lh %[load2], 60(%[input]) \n\t" - "lh %[load3], 36(%[input]) \n\t" - "lh %[load4], 28(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_30_64] \n\t" - "msub $ac1, %[load2], %[cospi_2_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_2_64] \n\t" - "madd $ac3, %[load2], %[cospi_30_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_14_64] \n\t" - "msub $ac2, %[load4], %[cospi_18_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_18_64] \n\t" - "madd $ac1, %[load4], %[cospi_14_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "msub $ac1, %[load1], %[cospi_8_64] \n\t" - "madd $ac1, %[load2], %[cospi_24_64] \n\t" - "madd $ac3, %[load1], %[cospi_24_64] \n\t" - "madd $ac3, %[load2], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[temp0], %[temp1] \n\t" - "add %[step2_15], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "lh %[load1], 20(%[input]) \n\t" - "lh %[load2], 44(%[input]) \n\t" - "lh %[load3], 52(%[input]) \n\t" - "lh %[load4], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_10_64] \n\t" - "madd $ac3, %[load2], %[cospi_22_64] \n\t" 
- "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_6_64] \n\t" - "msub $ac2, %[load4], %[cospi_26_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_26_64] \n\t" - "madd $ac1, %[load4], %[cospi_6_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[temp0], %[temp1] \n\t" - "add %[step2_12], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "sub %[temp0], %[step2_14], %[step2_13] \n\t" - "sub %[temp0], %[temp0], %[step2_9] \n\t" - "add %[temp0], %[temp0], %[step2_10] \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sub %[temp1], %[step2_14], %[step2_13] \n\t" - "add %[temp1], %[temp1], %[step2_9] \n\t" - "sub %[temp1], %[temp1], %[step2_10] \n\t" - "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "sub %[temp0], %[step2_15], %[step2_12] \n\t" - "sub %[temp0], %[temp0], %[step2_8] \n\t" - "add %[temp0], %[temp0], %[step2_11] \n\t" - "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sub %[temp1], %[step2_15], %[step2_12] \n\t" - "add %[temp1], %[temp1], %[step2_8] \n\t" - "sub %[temp1], %[temp1], %[step2_11] \n\t" - "madd $ac3, %[temp1], %[cospi_16_64] \n\t" - - "add %[step3_8], %[step2_8], %[step2_11] \n\t" - "add %[step3_9], %[step2_9], %[step2_10] \n\t" - "add %[step3_14], %[step2_13], %[step2_14] \n\t" - "add %[step3_15], %[step2_12], %[step2_15] \n\t" - - "extp %[step3_10], $ac0, 31 \n\t" - "extp %[step3_13], $ac1, 31 \n\t" - "extp %[step3_11], $ac2, 31 \n\t" - "extp %[step3_12], $ac3, 31 \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) - : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), - [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), - [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), - [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), - [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - - __asm__ 
__volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; - - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 32(%[input]) \n\t" - "lh %[load3], 16(%[input]) \n\t" - "lh %[load4], 48(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub 
%[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[temp2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[temp3], $ac1, 31 \n\t" - - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64) - - ); - - __asm__ __volatile__( - "lh %[load1], 8(%[input]) \n\t" - "lh %[load2], 56(%[input]) \n\t" - "lh %[load3], 40(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_12_64] \n\t" - "msub $ac2, %[load4], %[cospi_20_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_20_64] \n\t" - "madd $ac1, %[load4], %[cospi_12_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load1], %[load1], %[temp0] \n\t" - "add %[load1], %[load1], %[temp1] \n\t" - - "sub %[load2], %[temp0], %[temp1] \n\t" - "sub %[load2], %[load2], %[temp2] \n\t" - "add %[load2], %[load2], %[temp3] \n\t" - - "madd $ac1, %[load1], %[cospi_16_64] \n\t" - "madd $ac3, %[load2], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[temp0], %[temp1] \n\t" - "add %[step1_7], %[temp3], %[temp2] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - 
step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - // final stage - output[0 * 32] = step1_0 + step2_31; - output[1 * 32] = step1_1 + step2_30; - output[2 * 32] = step1_2 + step2_29; - output[3 * 32] = step1_3 + step2_28; - output[4 * 32] = step1_4 + step1_27; - output[5 * 32] = step1_5 + step1_26; - output[6 * 32] = step1_6 + step1_25; - output[7 * 32] = step1_7 + step1_24; - output[8 * 32] = step1_8 + step1_23; - output[9 * 32] = step1_9 + step1_22; - output[10 * 32] = step1_10 + step1_21; - output[11 * 32] = step1_11 + step1_20; - output[12 * 32] = step1_12 + step2_19; - output[13 * 32] = step1_13 + step2_18; - output[14 * 32] = step1_14 + step2_17; - output[15 * 32] = step1_15 + step2_16; - output[16 * 32] = step1_15 - step2_16; - output[17 * 32] = step1_14 - step2_17; - output[18 * 32] = step1_13 - step2_18; - output[19 * 32] = step1_12 - step2_19; - output[20 * 32] = step1_11 - step1_20; - output[21 * 32] = step1_10 - 
step1_21; - output[22 * 32] = step1_9 - step1_22; - output[23 * 32] = step1_8 - step1_23; - output[24 * 32] = step1_7 - step1_24; - output[25 * 32] = step1_6 - step1_25; - output[26 * 32] = step1_5 - step1_26; - output[27 * 32] = step1_4 - step1_27; - output[28 * 32] = step1_3 - step2_28; - output[29 * 32] = step1_2 - step2_29; - output[30 * 32] = step1_1 - step2_30; - output[31 * 32] = step1_0 - step2_31; - - input += 32; - output += 1; - } -} - -void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - // Rows - idct32_rows_dspr2(input, outptr, 32); - - // Columns - aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, - int stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); - int16_t *outptr = out; - uint32_t i; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - // Rows - idct32_rows_dspr2(input, outptr, 8); - - outptr += 8; - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 8(%[outptr]) \n\t" - "sw $zero, 12(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 24(%[outptr]) \n\t" - "sw $zero, 28(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 40(%[outptr]) \n\t" - "sw $zero, 44(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - - for (i = 0; i < 31; ++i) { - outptr += 32; - - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 8(%[outptr]) \n\t" - "sw $zero, 12(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 24(%[outptr]) \n\t" - "sw $zero, 28(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 40(%[outptr]) \n\t" - "sw $zero, 44(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - } - - // Columns - aom_idct32_cols_add_blk_dspr2(out, dest, stride); -} - -void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, - int stride) { - int r, out; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__( - "addi %[out], %[out], 32 \n\t" - "sra %[a1], %[out], 6 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 32; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 
8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - - "lw %[t1], 16(%[dest]) \n\t" - "lw %[t2], 20(%[dest]) \n\t" - "lw %[t3], 24(%[dest]) \n\t" - "lw %[t4], 28(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 16(%[dest]) \n\t" - "sw %[vector_2], 20(%[dest]) \n\t" - "sw %[vector_3], 24(%[dest]) \n\t" - "sw %[vector_4], 28(%[dest]) \n\t" - - "add %[dest], %[dest], %[stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 32; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - - "lw %[t1], 16(%[dest]) \n\t" - "lw %[t2], 20(%[dest]) \n\t" - "lw %[t3], 24(%[dest]) \n\t" - "lw %[t4], 28(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 16(%[dest]) \n\t" - "sw %[vector_2], 20(%[dest]) \n\t" - "sw %[vector_3], 24(%[dest]) \n\t" - "sw %[vector_4], 28(%[dest]) \n\t" - - "add %[dest], %[dest], %[stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); - } - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c deleted file mode 100644 index e6d0367cd..000000000 --- a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; - const int const_2_power_13 = 8192; - int i; - - for (i = 4; i--;) { - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[2]) * cospi_16_64; - step_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[2]) * cospi_16_64; - step_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 4(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "extp %[step_0], $ac0, 31 \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "extp %[step_1], $ac1, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - /* - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - step_2 = dct_const_round_shift(temp1); - */ - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "extp %[step_2], $ac0, 31 \n\t" - - /* - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step_3 = dct_const_round_shift(temp2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[step_3], $ac1, 31 \n\t" - - /* - output[0] = step_0 + step_3; - output[4] = step_1 + step_2; - output[8] = step_1 - step_2; - output[12] = step_0 - step_3; - */ - "add %[Temp0], %[step_0], %[step_3] \n\t" - "sh %[Temp0], 0(%[output]) \n\t" - - "add %[Temp1], %[step_1], %[step_2] \n\t" - "sh %[Temp1], 8(%[output]) \n\t" - - "sub %[Temp2], %[step_1], %[step_2] \n\t" - "sh %[Temp2], 16(%[output]) \n\t" - - "sub %[Temp3], %[step_0], %[step_3] \n\t" - "sh %[Temp3], 24(%[output]) \n\t" - - : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), - [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); - - input += 4; - output += 1; - } -} - -void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; - const int const_2_power_13 = 8192; - int i; - uint8_t *dest_pix; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 4; ++i) { - dest_pix = (dest + i); - - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[2]) * cospi_16_64; - step_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[2]) * cospi_16_64; - step_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 4(%[input]) \n\t" - "mtlo %[const_2_power_13], 
$ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "extp %[step_0], $ac0, 31 \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "extp %[step_1], $ac1, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - /* - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - step_2 = dct_const_round_shift(temp1); - */ - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "extp %[step_2], $ac0, 31 \n\t" - - /* - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step_3 = dct_const_round_shift(temp2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[step_3], $ac1, 31 \n\t" - - /* - output[0] = step_0 + step_3; - output[4] = step_1 + step_2; - output[8] = step_1 - step_2; - output[12] = step_0 - step_3; - */ - "add %[Temp0], %[step_0], %[step_3] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_0], %[step_3] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - - : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), - [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), - [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); - - input += 4; - } -} - -void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[4 * 4]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - // Rows - aom_idct4_rows_dspr2(input, outptr); - - // Columns - aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - int a1, absa1; - int r; - int32_t out; - int t2, vector_a1, vector_a; - uint32_t pos = 45; - int16_t input_dc = input[0]; - - /* bit positon for extract from acc */ - 
__asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); - __asm__ __volatile__( - "addi %[out], %[out], 8 \n\t" - "sra %[a1], %[out], 4 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 4; r--;) { - __asm__ __volatile__( - "lw %[t2], 0(%[dest]) \n\t" - "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" - "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 4; r--;) { - __asm__ __volatile__( - "lw %[t2], 0(%[dest]) \n\t" - "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" - "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } -} - -void iadst4_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - int x0, x1, x2, x3; - - x0 = input[0]; - x1 = input[1]; - x2 = input[2]; - x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = x0 - x2 + x3; - - x0 = s0 + s3 + s5; - x1 = s1 - s4 - s6; - x2 = sinpi_3_9 * s7; - x3 = s2; - - s0 = x0 + x3; - s1 = x1 + x3; - s2 = x2; - s3 = x0 + x1 - x3; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = dct_const_round_shift(s0); - output[1] = dct_const_round_shift(s1); - output[2] = dct_const_round_shift(s2); - output[3] = dct_const_round_shift(s3); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c deleted file mode 100644 index 0a20f76f2..000000000 --- a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c +++ /dev/null @@ -1,645 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - const int const_2_power_13 = 8192; - int Temp0, Temp1, Temp2, Temp3, Temp4; - int i; - - for (i = no_rows; i--;) { - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[4]) * cospi_16_64; - step2_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[4]) * cospi_16_64; - step2_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 8(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "extp %[Temp4], $ac0, 31 \n\t" - - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - /* - temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; - step2_2 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 4(%[input]) \n\t" - "lh %[Temp1], 12(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "extp %[Temp3], $ac0, 31 \n\t" - - /* - step1_1 = step2_1 + step2_2; - step1_2 = step2_1 - step2_2; - */ - "add %[step1_1], %[Temp2], %[Temp3] \n\t" - "sub %[step1_2], %[Temp2], %[Temp3] \n\t" - - /* - temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; - step2_3 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - /* - step1_0 = step2_0 + step2_3; - step1_3 = step2_0 - step2_3; - */ - "add %[step1_0], %[Temp4], %[Temp1] \n\t" - "sub %[step1_3], %[Temp4], %[Temp1] \n\t" - - /* - temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - step1_4 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 2(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp1], 14(%[input]) \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" - "extp %[step1_4], $ac0, 31 \n\t" - - /* - temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1_7 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" - "extp %[step1_7], $ac1, 31 \n\t" - - /* - temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - step1_5 = dct_const_round_shift(temp_1); - */ - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" - "extp %[step1_5], $ac0, 31 \n\t" - - /* - temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1_6 = dct_const_round_shift(temp_2); - */ - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" - "extp %[step1_6], $ac1, 31 \n\t" 
- - /* - temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; - temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; - */ - "sub %[Temp0], %[step1_7], %[step1_6] \n\t" - "sub %[Temp0], %[Temp0], %[step1_4] \n\t" - "add %[Temp0], %[Temp0], %[step1_5] \n\t" - "sub %[Temp1], %[step1_4], %[step1_5] \n\t" - "sub %[Temp1], %[Temp1], %[step1_6] \n\t" - "add %[Temp1], %[Temp1], %[step1_7] \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" - - /* - step1_4 = step1_4 + step1_5; - step1_7 = step1_6 + step1_7; - */ - "add %[step1_4], %[step1_4], %[step1_5] \n\t" - "add %[step1_7], %[step1_7], %[step1_6] \n\t" - - "extp %[step1_5], $ac0, 31 \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - "add %[Temp0], %[step1_0], %[step1_7] \n\t" - "sh %[Temp0], 0(%[output]) \n\t" - "add %[Temp1], %[step1_1], %[step1_6] \n\t" - "sh %[Temp1], 16(%[output]) \n\t" - "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "sh %[Temp0], 32(%[output]) \n\t" - "add %[Temp1], %[step1_3], %[step1_4] \n\t" - "sh %[Temp1], 48(%[output]) \n\t" - - "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "sh %[Temp0], 64(%[output]) \n\t" - "sub %[Temp1], %[step1_2], %[step1_5] \n\t" - "sh %[Temp1], 80(%[output]) \n\t" - "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "sh %[Temp0], 96(%[output]) \n\t" - "sub %[Temp1], %[step1_0], %[step1_7] \n\t" - "sh %[Temp1], 112(%[output]) \n\t" - - : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), - [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), - [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), - [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), - [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), - [input] "r"(input)); - - input += 8; - output += 1; - } -} - -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int Temp0, Temp1, Temp2, Temp3; - int i; - const int const_2_power_13 = 8192; - uint8_t *dest_pix; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 8; ++i) { - dest_pix = (dest + i); - - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[4]) * cospi_16_64; - step2_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[4]) * cospi_16_64; - step2_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 8(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "extp %[step1_6], $ac0, 31 \n\t" - - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - 
"mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - /* - temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; - step2_2 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 4(%[input]) \n\t" - "lh %[Temp1], 12(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "extp %[Temp3], $ac0, 31 \n\t" - - /* - step1_1 = step2_1 + step2_2; - step1_2 = step2_1 - step2_2; - */ - "add %[step1_1], %[Temp2], %[Temp3] \n\t" - "sub %[step1_2], %[Temp2], %[Temp3] \n\t" - - /* - temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; - step2_3 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - /* - step1_0 = step2_0 + step2_3; - step1_3 = step2_0 - step2_3; - */ - "add %[step1_0], %[step1_6], %[Temp1] \n\t" - "sub %[step1_3], %[step1_6], %[Temp1] \n\t" - - /* - temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - step1_4 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 2(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp1], 14(%[input]) \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" - "extp %[step1_4], $ac0, 31 \n\t" - - /* - temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1_7 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" - "extp %[step1_7], $ac1, 31 \n\t" - - /* - temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - step1_5 = dct_const_round_shift(temp_1); - */ - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" - "extp %[step1_5], $ac0, 31 \n\t" - - /* - temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1_6 = dct_const_round_shift(temp_2); - */ - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* - temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; - temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; - */ - "sub %[Temp0], %[step1_7], %[step1_6] \n\t" - "sub %[Temp0], %[Temp0], %[step1_4] \n\t" - "add %[Temp0], %[Temp0], %[step1_5] \n\t" - "sub %[Temp1], %[step1_4], %[step1_5] \n\t" - "sub %[Temp1], %[Temp1], %[step1_6] \n\t" - "add %[Temp1], %[Temp1], %[step1_7] \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" - - /* - step1_4 = step1_4 + step1_5; - step1_7 = step1_6 + step1_7; - */ - "add %[step1_4], %[step1_4], %[step1_5] \n\t" - "add %[step1_7], %[step1_7], %[step1_6] \n\t" - - "extp %[step1_5], $ac0, 31 \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* add block */ - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp0], %[step1_0], %[step1_7] \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - 
"add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_0], %[step1_7] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - - : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), - [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), - [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), - [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), - [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); - - input += 8; - } -} - -void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[8 * 8]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows - idct8_rows_dspr2(input, outptr, 8); - - // Then transform columns and add to dest - 
idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[8 * 8]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows - idct8_rows_dspr2(input, outptr, 4); - - outptr += 4; - - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 48(%[outptr]) \n\t" - "sw $zero, 52(%[outptr]) \n\t" - "sw $zero, 64(%[outptr]) \n\t" - "sw $zero, 68(%[outptr]) \n\t" - "sw $zero, 80(%[outptr]) \n\t" - "sw $zero, 84(%[outptr]) \n\t" - "sw $zero, 96(%[outptr]) \n\t" - "sw $zero, 100(%[outptr]) \n\t" - "sw $zero, 112(%[outptr]) \n\t" - "sw $zero, 116(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - - // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - uint32_t pos = 45; - int32_t out; - int32_t r; - int32_t a1, absa1; - int32_t t1, t2, vector_a1, vector_1, vector_2; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__( - "addi %[out], %[out], 16 \n\t" - "sra %[a1], %[out], 5 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 8; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), - [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 8; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), - [vector_2] "=&r"(vector_2), [dest] "+r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } -} - -void iadst8_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - int x0, x1, x2, x3, x4, x5, x6, x7; - - x0 = input[7]; - x1 = input[0]; - x2 = input[5]; - x3 = input[2]; - x4 = input[3]; - x5 = input[4]; - x6 = input[1]; - x7 = input[6]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = 
output[7] = 0; - return; - } - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); - x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); - x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); - x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); - x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); - x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); - x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); - x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); - - output[0] = x0; - output[1] = -x4; - output[2] = x6; - output[3] = -x2; - output[4] = x3; - output[5] = -x7; - output[6] = x5; - output[7] = -x1; -} -#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c index fc0c32ce3..38a10e9b2 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c +++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c @@ -404,10 +404,11 @@ void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { } } -void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, int32_t count) { +static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); uint8_t early_exit = 0; @@ -639,19 +640,19 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, } } } else { - aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, - thresh_ptr, count); + mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, + count); } } -void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); } -void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, +void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr) { diff --git 
a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
index 883d0523d..8c41278be 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
@@ -11,7 +11,8 @@
 #include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
index 72df09823..3e38ef3fb 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
@@ -14,7 +14,8 @@
 #include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
index 3e6994714..cb599cf2e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
@@ -14,7 +14,8 @@
 #include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
index 8db3e521f..6db1dac08 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -14,7 +14,8 @@
 #include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
index a3b5a9eb1..b67ccfe9d 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
@@ -11,7 +11,8 @@
 #include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 8d2fd69f7..34733e42e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -11,7 +11,8 @@
 #include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
@@ -718,14 +719,13 @@ static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
   }
 }
-void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,
-                                     const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh) {
+void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
+                                 const uint8_t *blimit, const uint8_t *limit,
+                                 const uint8_t *thresh) {
   mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
 }
-void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,
+void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh) {
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
index 28528869b..3d3f1ec97 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -11,7 +11,8 @@
 #include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
index 48fbcfd47..eb919d42b 100644
--- a/third_party/aom/aom_dsp/mips/macros_msa.h
+++ b/third_party/aom/aom_dsp/mips/macros_msa.h
@@ -14,7 +14,8 @@
 #include <msa.h>
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
index 258eb5c07..58cdd80d9 100644
--- a/third_party/aom/aom_dsp/mips/sad_msa.c
+++ b/third_party/aom/aom_dsp/mips/sad_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 #define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
@@ -160,640 +161,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
   return sad;
 }
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33,
ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref, ref0, ref1, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v4u32 sad; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0_0, 
ref0_1, ref0_2, ref0_3); - ref0_4 = LD_UB(ref + 64); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32((v4i32)sad); -} - -static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3, diff; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, src); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] 
= HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref0, ref1, ref; - v16u8 diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - 
sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1; - v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, 
ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - const uint8_t *src_dup, *ref_dup; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - v4u32 sad; - - src_dup = src; - ref_dup = ref; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[3] = HADD_SW_S32(sad); - - sad0_0 = (v8u16)__msa_ldi_h(0); - sad0_1 = (v8u16)__msa_ldi_h(0); - sad1_0 = (v8u16)__msa_ldi_h(0); - sad1_1 = (v8u16)__msa_ldi_h(0); - sad2_0 = (v8u16)__msa_ldi_h(0); - sad2_1 = (v8u16)__msa_ldi_h(0); - sad3_0 = (v8u16)__msa_ldi_h(0); - sad3_1 = (v8u16)__msa_ldi_h(0); - - for (ht_cnt = 64; ht_cnt--;) { - LD_UB4(src_dup, 16, src0, src1, src2, src3); - src_dup += src_stride; - LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref_dup += ref_stride; - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, 
ref1, 7); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[4] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[5] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[6] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[7] = HADD_SW_S32(sad); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1290,76 +657,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ } -#define AOM_SAD_4xHEIGHTx3_MSA(height) \ - void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_8xHEIGHTx3_MSA(height) \ - void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_16xHEIGHTx3_MSA(height) \ - void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_32xHEIGHTx3_MSA(height) \ - void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_64xHEIGHTx3_MSA(height) \ - void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_4xHEIGHTx8_MSA(height) \ - void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_8xHEIGHTx8_MSA(height) \ - void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_16xHEIGHTx8_MSA(height) \ - void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_32xHEIGHTx8_MSA(height) \ - void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_64xHEIGHTx8_MSA(height) \ - void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, 
height, sads); \ - } - #define AOM_SAD_4xHEIGHTx4D_MSA(height) \ void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -1438,92 +735,66 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, /* clang-format off */ // 64x64 AOM_SAD_64xHEIGHT_MSA(64) -AOM_SAD_64xHEIGHTx3_MSA(64) -AOM_SAD_64xHEIGHTx8_MSA(64) AOM_SAD_64xHEIGHTx4D_MSA(64) AOM_AVGSAD_64xHEIGHT_MSA(64) // 64x32 AOM_SAD_64xHEIGHT_MSA(32) -AOM_SAD_64xHEIGHTx3_MSA(32) -AOM_SAD_64xHEIGHTx8_MSA(32) AOM_SAD_64xHEIGHTx4D_MSA(32) AOM_AVGSAD_64xHEIGHT_MSA(32) // 32x64 AOM_SAD_32xHEIGHT_MSA(64) -AOM_SAD_32xHEIGHTx3_MSA(64) -AOM_SAD_32xHEIGHTx8_MSA(64) AOM_SAD_32xHEIGHTx4D_MSA(64) AOM_AVGSAD_32xHEIGHT_MSA(64) // 32x32 AOM_SAD_32xHEIGHT_MSA(32) -AOM_SAD_32xHEIGHTx3_MSA(32) -AOM_SAD_32xHEIGHTx8_MSA(32) AOM_SAD_32xHEIGHTx4D_MSA(32) AOM_AVGSAD_32xHEIGHT_MSA(32) // 32x16 AOM_SAD_32xHEIGHT_MSA(16) -AOM_SAD_32xHEIGHTx3_MSA(16) -AOM_SAD_32xHEIGHTx8_MSA(16) AOM_SAD_32xHEIGHTx4D_MSA(16) AOM_AVGSAD_32xHEIGHT_MSA(16) // 16x32 AOM_SAD_16xHEIGHT_MSA(32) -AOM_SAD_16xHEIGHTx3_MSA(32) -AOM_SAD_16xHEIGHTx8_MSA(32) AOM_SAD_16xHEIGHTx4D_MSA(32) AOM_AVGSAD_16xHEIGHT_MSA(32) // 16x16 AOM_SAD_16xHEIGHT_MSA(16) -AOM_SAD_16xHEIGHTx3_MSA(16) -AOM_SAD_16xHEIGHTx8_MSA(16) AOM_SAD_16xHEIGHTx4D_MSA(16) AOM_AVGSAD_16xHEIGHT_MSA(16) // 16x8 AOM_SAD_16xHEIGHT_MSA(8) -AOM_SAD_16xHEIGHTx3_MSA(8) -AOM_SAD_16xHEIGHTx8_MSA(8) AOM_SAD_16xHEIGHTx4D_MSA(8) AOM_AVGSAD_16xHEIGHT_MSA(8) // 8x16 AOM_SAD_8xHEIGHT_MSA(16) -AOM_SAD_8xHEIGHTx3_MSA(16) -AOM_SAD_8xHEIGHTx8_MSA(16) AOM_SAD_8xHEIGHTx4D_MSA(16) AOM_AVGSAD_8xHEIGHT_MSA(16) // 8x8 AOM_SAD_8xHEIGHT_MSA(8) -AOM_SAD_8xHEIGHTx3_MSA(8) -AOM_SAD_8xHEIGHTx8_MSA(8) AOM_SAD_8xHEIGHTx4D_MSA(8) AOM_AVGSAD_8xHEIGHT_MSA(8) // 8x4 AOM_SAD_8xHEIGHT_MSA(4) -AOM_SAD_8xHEIGHTx3_MSA(4) -AOM_SAD_8xHEIGHTx8_MSA(4) AOM_SAD_8xHEIGHTx4D_MSA(4) AOM_AVGSAD_8xHEIGHT_MSA(4) // 4x8 AOM_SAD_4xHEIGHT_MSA(8) -AOM_SAD_4xHEIGHTx3_MSA(8) -AOM_SAD_4xHEIGHTx8_MSA(8) AOM_SAD_4xHEIGHTx4D_MSA(8) AOM_AVGSAD_4xHEIGHT_MSA(8) // 4x4 AOM_SAD_4xHEIGHT_MSA(4) -AOM_SAD_4xHEIGHTx3_MSA(4) -AOM_SAD_4xHEIGHTx8_MSA(4) AOM_SAD_4xHEIGHTx4D_MSA(4) AOM_AVGSAD_4xHEIGHT_MSA(4) /* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c index 3eb85107d..a8ee85b6b 100644 --- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c +++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" #include "aom_dsp/mips/macros_msa.h" #include "aom_dsp/variance.h" diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c index 37b89765d..bfed773ac 100644 --- a/third_party/aom/aom_dsp/mips/subtract_msa.c +++ b/third_party/aom/aom_dsp/mips/subtract_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h deleted file mode 100644 index cba5d4445..000000000 --- a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ -#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" - -#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ - { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ - \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ - ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ - \ - DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ - } - -#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ - dst1, dst2, dst3) \ - { \ - v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ - v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ - \ - DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ - tp4_m); \ - DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ - tp8_m); \ - BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ - BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ - SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ - SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ - dst1, dst2, dst3); \ - } - -#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ - ({ \ - v8i16 dst_m; \ - v4i32 tp0_m, tp1_m; \ - \ - DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ - SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ - dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ - \ - dst_m; \ - }) - -#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ - { \ - v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ - v8i16 madd_s0_m, madd_s1_m; \ - \ - ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ - madd0_m, madd1_m, madd2_m, madd3_m); \ - SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ - } - -#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ - out2, out3) \ - { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ - cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ - SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ - cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ - 
SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ - } -#endif // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c index 745fdfc9c..065c09ac5 100644 --- a/third_party/aom/aom_dsp/mips/variance_msa.c +++ b/third_party/aom/aom_dsp/mips/variance_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" #define CALC_MSE_B(src, ref, var) \