summary | refs | log | tree | commit | diff | stats
path: root/third_party/aom/aom_dsp/mips
diff options
context:
space:
mode:
author: trav90 <travawine@palemoon.org> 2018-10-19 21:52:15 -0500
committer: trav90 <travawine@palemoon.org> 2018-10-19 21:52:20 -0500
commit: bbcc64772580c8a979288791afa02d30bc476d2e (patch)
tree: 437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/aom_dsp/mips
parent: 14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff)
download: UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.gz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.lz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.xz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.zip
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/aom_dsp/mips')
-rw-r--r-- third_party/aom/aom_dsp/mips/add_noise_msa.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c | 704
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c | 605
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c | 677
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c | 4
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve8_msa.c | 630
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c | 4
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c | 233
-rw-r--r-- third_party/aom/aom_dsp/mips/aom_convolve_msa.h | 45
-rw-r--r-- third_party/aom/aom_dsp/mips/common_dspr2.h | 4
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c | 256
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c | 802
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve2_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c | 646
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c | 998
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve8_dspr2.c | 1379
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/convolve_common_dspr2.h | 15
-rw-r--r-- third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c | 928
-rw-r--r-- third_party/aom/aom_dsp/mips/fwd_txfm_msa.c | 238
-rw-r--r-- third_party/aom/aom_dsp/mips/fwd_txfm_msa.h | 381
-rw-r--r-- third_party/aom/aom_dsp/mips/idct16x16_msa.c | 486
-rw-r--r-- third_party/aom/aom_dsp/mips/idct32x32_msa.c | 730
-rw-r--r-- third_party/aom/aom_dsp/mips/idct4x4_msa.c | 99
-rw-r--r-- third_party/aom/aom_dsp/mips/idct8x8_msa.c | 117
-rw-r--r-- third_party/aom/aom_dsp/mips/intrapred_msa.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h | 82
-rw-r--r-- third_party/aom/aom_dsp/mips/inv_txfm_msa.h | 412
-rw-r--r-- third_party/aom/aom_dsp/mips/itrans16_dspr2.c | 1190
-rw-r--r-- third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c | 1042
-rw-r--r-- third_party/aom/aom_dsp/mips/itrans32_dspr2.c | 1030
-rw-r--r-- third_party/aom/aom_dsp/mips/itrans4_dspr2.c | 342
-rw-r--r-- third_party/aom/aom_dsp/mips/itrans8_dspr2.c | 645
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_16_msa.c | 23
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c | 12
-rw-r--r-- third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/macros_msa.h | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/sad_msa.c | 733
-rw-r--r-- third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/subtract_msa.c | 3
-rw-r--r-- third_party/aom/aom_dsp/mips/txfm_macros_msa.h | 97
-rw-r--r-- third_party/aom/aom_dsp/mips/variance_msa.c | 3
50 files changed, 67 insertions, 15573 deletions
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
index 4c6e201e1..96d04cff0 100644
--- a/third_party/aom/aom_dsp/mips/add_noise_msa.c
+++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c
@@ -10,7 +10,8 @@
*/
#include <stdlib.h>
-#include "./macros_msa.h"
+
+#include "aom_dsp/mips/macros_msa.h"
void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
char blackclamp[16], char whiteclamp[16],
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
deleted file mode 100644
index 847394a3d..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
+++ /dev/null
@@ -1,704 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 dst0, dst1, dst2, dst3, res2, res3;
- v16u8 mask0, mask1, mask2, mask3;
- v8i16 filt, res0, res1;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, res0, res1);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- SRARI_H2_SH(res0, res1, FILTER_BITS);
- SAT_SH2_SH(res0, res1, 7);
- PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- XORI_B2_128_UB(res2, res3);
- AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v8i16 filt, vec0, vec1, vec2, vec3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, vec0, vec1);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, vec2, vec3);
- SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
- SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
- PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
- res3);
- ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
- XORI_B2_128_UB(res0, res2);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
- ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
- AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
- ST4x8_UB(res0, res2, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- int32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
- dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- int32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
- v8i16 filt, out0, out1, out2, out3;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height >> 1; loop_cnt--;) {
- LD_SB2(src, src_stride, src0, src2);
- LD_SB2(src + 8, src_stride, src1, src3);
- src += (2 * src_stride);
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
- VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
- VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
- vec14);
- VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
- vec15);
- DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
- vec9, vec10, vec11);
- DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
- vec2, vec3);
- DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
- vec9, vec10, vec11);
- ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
- out2, out3);
- LD_UB2(dst, dst_stride, dst0, dst1);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
- dst += dst_stride;
- PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
- v8i16 filt, out0, out1, out2, out3;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
- VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
- VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
- vec14);
- VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
- vec15);
- DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
- vec9, vec10, vec11);
- DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
- vec2, vec3);
- DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
- vec9, vec10, vec11);
- ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- LD_UB2(dst, 16, dst1, dst2);
- PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
- PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt, cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
- v8i16 filt, out0, out1, out2, out3;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height; loop_cnt--;) {
- for (cnt = 0; cnt < 2; ++cnt) {
- src0 = LD_SB(&src[cnt << 5]);
- src2 = LD_SB(&src[16 + (cnt << 5)]);
- src3 = LD_SB(&src[24 + (cnt << 5)]);
- src1 = __msa_sldi_b(src2, src0, 8);
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
- vec12);
- VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
- vec13);
- VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
- vec14);
- VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
- vec15);
- DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
- vec1, vec2, vec3);
- DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
- vec9, vec10, vec11);
- DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
- vec1, vec2, vec3);
- DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
- vec9, vec10, vec11);
- ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
- PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
- PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
- }
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
- v8u16 vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
- SRARI_H2_UH(vec2, vec3, FILTER_BITS);
- PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v8u16 vec4, vec5, vec6, vec7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
- vec6, vec7);
- SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
- res3);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
- AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v8u16 vec0, vec1, vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_8x8mult_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v8u16 vec0, vec1, vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
- dst += (4 * dst_stride);
-
- if (16 == height) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
- dst_stride);
- }
-}
-
-static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter, height);
- }
-}
-
-static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
- res2, res3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
- res6, res7);
- SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
- SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
- dst += dst_stride;
-
- for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
- res2, res3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
- res6, res7);
- SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
- SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- src4 = LD_SB(src);
- src6 = LD_SB(src + 16);
- src7 = LD_SB(src + 24);
- src5 = __msa_sldi_b(src6, src4, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
- res2, res3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
- res6, res7);
- SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
- SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
- LD_UB2(dst, 16, dst0, dst1);
- PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
- PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
- dst += dst_stride;
- LD_UB2(dst, 16, dst2, dst3);
- PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
- PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- for (loop_cnt = height; loop_cnt--;) {
- LD_SB4(src, 16, src0, src2, src4, src6);
- src7 = LD_SB(src + 56);
- SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
- PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
- PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
- PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
- dst += dst_stride;
- }
-}
-
-void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- int8_t cnt, filt_hor[8];
-
- assert(x_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0) {
- switch (w) {
- case 4:
- common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3], h);
- break;
- case 8:
- common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3], h);
- break;
- case 16:
- common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3], h);
- break;
- case 32:
- common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3], h);
- break;
- case 64:
- common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3], h);
- break;
- default:
- aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, h);
- break;
- case 8:
- common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, h);
- break;
- case 16:
- common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, h);
- break;
- case 32:
- common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, h);
- break;
- case 64:
- common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, h);
- break;
- default:
- aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
deleted file mode 100644
index bed600d5b..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
- vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
- vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
- SRARI_H2_SH(res0, res1, FILTER_BITS);
- SAT_SH2_SH(res0, res1, 7);
- PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
- XORI_B2_128_UB(tmp0, tmp1);
- AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
- ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out5 = hz_out9;
- vec0 = vec2;
- vec1 = vec3;
- vec2 = vec4;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
- v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
- v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
- ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
- ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
- tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
- tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
- CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
- dst_stride);
- dst += (4 * dst_stride);
-
- hz_out6 = hz_out10;
- out0 = out2;
- out1 = out3;
- out2 = out8;
- out4 = out6;
- out5 = out7;
- out6 = out9;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 8; multiple8_cnt--;) {
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 dst0, dst1, dst2, dst3, res0, res1;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- filt = LD_UH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
- AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- src8 = LD_SB(src);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
- hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
- hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
- SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
- hz_out3, hz_out5, 8);
- hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
- tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
- res3);
- AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert);
- } else if (8 == height) {
- common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert);
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- src += (5 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
- dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_SB(src);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
- dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert);
- } else {
- common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
- src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
- dst += dst_stride;
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- int8_t cnt, filt_hor[8], filt_ver[8];
-
- assert(x_step_q4 == 16);
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0 &&
- ((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3],
- &filt_ver[3], h);
- break;
- case 8:
- common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3],
- &filt_ver[3], h);
- break;
- case 16:
- common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- case 32:
- common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- case 64:
- common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- default:
- aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- } else if (((const int32_t *)filter_x)[0] == 0 ||
- ((const int32_t *)filter_y)[0] == 0) {
- aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- } else {
- switch (w) {
- case 4:
- common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor,
- filt_ver, h);
- break;
- case 8:
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor,
- filt_ver, h);
- break;
- case 16:
- common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor,
- filt_ver, h);
- break;
- case 32:
- common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor,
- filt_ver, h);
- break;
- case 64:
- common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor,
- filt_ver, h);
- break;
- default:
- aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
deleted file mode 100644
index dae771104..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
+++ /dev/null
@@ -1,677 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3, out;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
- v16i8 src10998, filt0, filt1, filt2, filt3;
- v8i16 filt, out10, out32;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
- src4332, src6554);
- XORI_B3_128_SB(src2110, src4332, src6554);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
- XORI_B2_128_SB(src8776, src10998);
- out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
- filt1, filt2, filt3);
- out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
- filt1, filt2, filt3);
- SRARI_H2_SH(out10, out32, FILTER_BITS);
- SAT_SH2_SH(out10, out32, 7);
- out = PCKEV_XORI128_UB(out10, out32);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
- dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
- out = __msa_aver_u_b(out, dst0);
-
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src2110 = src6554;
- src4332 = src8776;
- src6554 = src10998;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
- v8i16 filt, out0, out1, out2, out3;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- XORI_B4_128_SB(src7, src8, src9, src10);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
- filt2, filt3);
- out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
- filt2, filt3);
- out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
- filt2, filt3);
- out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
- dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_and_aver_dst_16w_mult_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height, int32_t width) {
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- uint32_t loop_cnt, cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16i8 filt0, filt1, filt2, filt3;
- v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
- v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src_tmp += (7 * src_stride);
-
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
- src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
- src_tmp += (4 * src_stride);
-
- LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
- XORI_B4_128_SB(src7, src8, src9, src10);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
- out3_r, tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
- dst2, dst3);
- ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
- dst_tmp += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
- filter, height, 16);
-}
-
-static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
- filter, height, 32);
-}
-
-static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
- filter, height, 64);
-}
-
-static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4;
- v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
- v16i8 src10_r, src32_r, src21_r, src43_r;
- v8i16 filt;
- v8u16 tmp0, tmp1;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- src4 = LD_SB(src);
- src += src_stride;
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
- dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, dst0);
-
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
- v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
- v16u8 src2110, src4332, src6554, src8776, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- src8 = LD_SB(src);
-
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
- dst3);
- ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
- src76_r, src87_r);
- ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
- src76_r, src2110, src4332, src6554, src8776);
- DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
- AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16u8 src0, src1, src2, src3, src4;
- v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
- ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
- dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_8x8mult_msa(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
- src += (8 * src_stride);
- LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
-
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
- vec3);
- ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
- vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
- dst_stride);
- dst += (4 * dst_stride);
-
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
- dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src8;
- }
-}
-
-static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter, height);
- }
-}
-
-static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
- /* rearranging filter_y */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
- dst += dst_stride;
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
- dst += dst_stride;
-
- src0 = src4;
- }
-}
-
-static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
- /* rearranging filter_y */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_UB2(src, 16, src0, src5);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
- LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
- LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
- src += (4 * src_stride);
-
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
-
- ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
- ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
- ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src4;
- src5 = src9;
- }
-}
-
-static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5;
- v16u8 src6, src7, src8, src9, src10, src11, filt0;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v8u16 filt;
-
- /* rearranging filter_y */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_UB4(src, 16, src0, src3, src6, src9);
- src += src_stride;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_UB2(src, src_stride, src1, src2);
- LD_UB2(dst, dst_stride, dst0, dst1);
- LD_UB2(src + 16, src_stride, src4, src5);
- LD_UB2(dst + 16, dst_stride, dst2, dst3);
- LD_UB2(src + 32, src_stride, src7, src8);
- LD_UB2(dst + 32, dst_stride, dst4, dst5);
- LD_UB2(src + 48, src_stride, src10, src11);
- LD_UB2(dst + 48, dst_stride, dst6, dst7);
- src += (2 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
- ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
- ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
- ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
-
- ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
- ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
- dst += (2 * dst_stride);
-
- src0 = src2;
- src3 = src5;
- src6 = src8;
- src9 = src11;
- }
-}
-
-void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- int8_t cnt, filt_ver[8];
-
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_ver[3], h);
- break;
- case 8:
- common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_ver[3], h);
- break;
- case 16:
- common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_ver[3], h);
- break;
- case 32:
- common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_ver[3], h);
- break;
- case 64:
- common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_ver[3], h);
- break;
- default:
- aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_ver, h);
- break;
- case 8:
- common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_ver, h);
- break;
- case 16:
- common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_ver, h);
-
- break;
- case 32:
- common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_ver, h);
- break;
- case 64:
- common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_ver, h);
- break;
- default:
- aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
index fc3a823c5..363fad308 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
@@ -10,7 +10,9 @@
*/
#include <assert.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/aom_convolve_msa.h"
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
deleted file mode 100644
index a4d594931..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
+++ /dev/null
@@ -1,630 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-const uint8_t mc_filt_mask_arr[16 * 3] = {
- /* 8 width cases */
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- /* 4 width cases */
- 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
- /* 4 width cases */
- 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
-};
-
-static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
- out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
- out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
- out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
- SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
- SAT_SH2_SH(tmp0, tmp1, 7);
- out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out5 = hz_out9;
- out0 = out2;
- out1 = out3;
- out2 = out4;
- }
-}
-
-static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
- v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
- ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
- ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- XORI_B4_128_SB(src7, src8, src9, src10);
-
- hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
- tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
- tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
- SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
- vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
- vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(vec0, vec1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out6 = hz_out10;
- out0 = out2;
- out1 = out3;
- out2 = out8;
- out4 = out6;
- out5 = out7;
- out6 = out9;
- }
-}
-
-static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 8; multiple8_cnt--;) {
- common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- filt = LD_UH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
- v16i8 res0, res1, res2, res3;
- v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- filt = LD_UH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- src8 = LD_SB(src);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
- hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
- hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
- SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
- hz_out3, hz_out5, 8);
- hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
- vec5, vec6, vec7);
- SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert);
- } else if (8 == height) {
- common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert);
- }
-}
-
-static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
- v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
- v16u8 filt_hz, filt_vt, vec0;
- v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_SB(src);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_SB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- LD_SB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp4 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
- PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp5 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp6 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp7 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp8 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
- PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert);
- } else {
- common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- }
-}
-
-static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int32_t x_step_q4, const int16_t *filter_y,
- int32_t y_step_q4, int32_t w, int32_t h) {
- int8_t cnt, filt_hor[8], filt_ver[8];
-
- assert(x_step_q4 == 16);
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0 &&
- ((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3],
- &filt_ver[3], (int32_t)h);
- break;
- case 8:
- common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3],
- &filt_ver[3], (int32_t)h);
- break;
- case 16:
- common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3],
- &filt_ver[3], (int32_t)h);
- break;
- case 32:
- common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3],
- &filt_ver[3], (int32_t)h);
- break;
- case 64:
- common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, &filt_hor[3],
- &filt_ver[3], (int32_t)h);
- break;
- default:
- aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- break;
- }
- } else if (((const int32_t *)filter_x)[0] == 0 ||
- ((const int32_t *)filter_y)[0] == 0) {
- aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- } else {
- switch (w) {
- case 4:
- common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, filt_ver,
- (int32_t)h);
- break;
- case 8:
- common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, filt_ver,
- (int32_t)h);
- break;
- case 16:
- common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, filt_ver,
- (int32_t)h);
- break;
- case 32:
- common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, filt_ver,
- (int32_t)h);
- break;
- case 64:
- common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filt_hor, filt_ver,
- (int32_t)h);
- break;
- default:
- aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
index f7bdfc2bd..aa962b41f 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
@@ -10,7 +10,9 @@
*/
#include <assert.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/aom_convolve_msa.h"
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
deleted file mode 100644
index 75f8c7ea8..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/macros_msa.h"
-
-static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint32_t out0, out1, out2, out3;
- v16u8 src0, src1, src2, src3;
- v16u8 dst0, dst1, dst2, dst3;
-
- if (0 == (height % 4)) {
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
- dst2, dst3);
-
- out0 = __msa_copy_u_w((v4i32)dst0, 0);
- out1 = __msa_copy_u_w((v4i32)dst1, 0);
- out2 = __msa_copy_u_w((v4i32)dst2, 0);
- out3 = __msa_copy_u_w((v4i32)dst3, 0);
- SW4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == (height % 2)) {
- for (cnt = (height / 2); cnt--;) {
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
-
- LD_UB2(dst, dst_stride, dst0, dst1);
-
- AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
-
- out0 = __msa_copy_u_w((v4i32)dst0, 0);
- out1 = __msa_copy_u_w((v4i32)dst1, 0);
- SW(out0, dst);
- dst += dst_stride;
- SW(out1, dst);
- dst += dst_stride;
- }
- }
-}
-
-static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
- int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint64_t out0, out1, out2, out3;
- v16u8 src0, src1, src2, src3;
- v16u8 dst0, dst1, dst2, dst3;
-
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
- dst2, dst3);
-
- out0 = __msa_copy_u_d((v2i64)dst0, 0);
- out1 = __msa_copy_u_d((v2i64)dst1, 0);
- out2 = __msa_copy_u_d((v2i64)dst2, 0);
- out3 = __msa_copy_u_d((v2i64)dst3, 0);
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-
- for (cnt = (height / 8); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
- dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
- dst6, dst7);
- ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
- dst += (8 * dst_stride);
- }
-}
-
-static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint8_t *dst_dup = dst;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
- for (cnt = (height / 8); cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
- LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
- dst_dup += (4 * dst_stride);
- LD_UB4(src, src_stride, src8, src10, src12, src14);
- LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
- src += (4 * src_stride);
- LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
- LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
- dst_dup += (4 * dst_stride);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
- dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
- dst6, dst7);
- AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
- dst10, dst11);
- AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
- dst13, dst14, dst15);
-
- ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
- ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
- ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint8_t *dst_dup = dst;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(src, 16, src4, src5, src6, src7);
- src += src_stride;
- LD_UB4(src, 16, src8, src9, src10, src11);
- src += src_stride;
- LD_UB4(src, 16, src12, src13, src14, src15);
- src += src_stride;
-
- LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
- dst_dup += dst_stride;
- LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
- dst_dup += dst_stride;
- LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
- dst_dup += dst_stride;
- LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
- dst_dup += dst_stride;
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
- dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
- dst6, dst7);
- AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
- dst10, dst11);
- AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
- dst13, dst14, dst15);
-
- ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
- dst += dst_stride;
- ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
- dst += dst_stride;
- ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
- dst += dst_stride;
- ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
- dst += dst_stride;
- }
-}
-
-void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
- int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
-
- switch (w) {
- case 4: {
- avg_width4_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 8: {
- avg_width8_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 16: {
- avg_width16_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 32: {
- avg_width32_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 64: {
- avg_width64_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- default: {
- int32_t lp, cnt;
- for (cnt = h; cnt--;) {
- for (lp = 0; lp < w; ++lp) {
- dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
- }
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
index 1a0ae4d8d..a0627c074 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
@@ -31,23 +31,6 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
tmp_dpadd_0; \
})
-#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \
- filt_h1, filt_h2, filt_h3) \
- ({ \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
- v8i16 hz_out_m; \
- \
- VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
- vec3_m); \
- hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \
- filt_h1, filt_h2, filt_h3); \
- \
- hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
- hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
- \
- hz_out_m; \
- })
-
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
mask2, mask3, filt0, filt1, filt2, filt3, \
out0, out1) \
@@ -93,32 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
res7_m, out0, out1, out2, out3); \
}
-#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
- { \
- v16u8 tmp_m; \
- \
- tmp_m = PCKEV_XORI128_UB(in1, in0); \
- tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
- ST_UB(tmp_m, (pdst)); \
- }
-
-#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \
- { \
- v16u8 tmp_m; \
- \
- tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
- ST_UB(tmp_m, (pdst)); \
- }
-
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
- stride) \
- { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
- PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
- }
#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
index 31159fdcd..d51bfa899 100644
--- a/third_party/aom/aom_dsp/mips/common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.h
@@ -13,7 +13,9 @@
#define AOM_COMMON_MIPS_DSPR2_H_
#include <assert.h>
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
#include "aom/aom_integer.h"
#ifdef __cplusplus
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
deleted file mode 100644
index d557115b9..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t w,
- int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < w; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
- "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
- "extp %[Temp2], $ac3, 31 \n\t"
- "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
- "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
- [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y, int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
- prefetch_store(dst + dst_stride + 32);
-
- for (x = 0; x < 64; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
- "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
- "extp %[Temp2], $ac3, 31 \n\t"
- "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
- "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
- [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- uint32_t pos = 38;
-
- assert(y_step_q4 == 16);
-
- /* bit positon for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- case 8:
- case 16:
- case 32:
- convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
- w, h);
- break;
- case 64:
- prefetch_store(dst + 32);
- convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
- h);
- break;
- default:
- aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
deleted file mode 100644
index efbdcf60f..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- int32_t Temp1, Temp2, Temp3, Temp4;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3;
- uint32_t tn1, tn2;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p3], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
- "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
- "extp %[Temp4], $ac2, 31 \n\t"
-
- "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
- "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
-
- /* clamp */
- "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
- "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
- "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
-
- "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
- "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
-
- "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
- "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [Temp4] "=&r"(Temp4)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2, tp3, tp4;
- uint32_t p1, p2, p3, p4, n1;
- uint32_t st0, st1;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
- "lbu %[Temp2], 0(%[dst]) \n\t"
- "lbu %[tp4], 2(%[dst]) \n\t"
-
- /* even 2. pixel */
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "lbux %[st1], %[Temp3](%[cm]) \n\t"
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
- "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
- "sb %[Temp2], 0(%[dst]) \n\t"
- "sb %[tp4], 2(%[dst]) \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "balign %[tp3], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
-
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
- "lbu %[Temp2], 4(%[dst]) \n\t"
- "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sb %[Temp2], 4(%[dst]) \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tp3] \n\t"
- "preceu.ph.qbl %[p4], %[tp3] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "lbu %[tp1], 6(%[dst]) \n\t"
-
- /* odd 2. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "lbux %[st0], %[Temp3](%[cm]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- "lbu %[tp2], 1(%[dst]) \n\t"
- "lbu %[tp3], 3(%[dst]) \n\t"
- "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
-
- /* odd 3. pixel */
- "lbux %[st1], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
- "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "lbu %[tp4], 5(%[dst]) \n\t"
-
- /* odd 4. pixel */
- "sb %[tp2], 1(%[dst]) \n\t"
- "sb %[tp1], 6(%[dst]) \n\t"
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbu %[tp1], 7(%[dst]) \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
-
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
-
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
- "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
-
- /* store bytes */
- "sb %[tp3], 3(%[dst]) \n\t"
- "sb %[tp4], 5(%[dst]) \n\t"
- "sb %[tp1], 7(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h,
- int32_t count) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_store(dst_ptr + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
- "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
- "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
- "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
- "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
- "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
- "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
- "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
- "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
- "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
- "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
- "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
-
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
-
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
-
- "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
- "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
- "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
- [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
- [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
- prefetch_store(dst_ptr + dst_stride);
- prefetch_store(dst_ptr + dst_stride + 32);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
- "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
- "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
- "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
- "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
- "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
- "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
- "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
- "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
- "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
- "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
- "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
-
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
-
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
-
- "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
- "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
- "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
- [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
- [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- uint32_t pos = 38;
-
- assert(x_step_q4 == 16);
-
- /* bit positon for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h);
- break;
- case 8:
- convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h);
- break;
- case 16:
- convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h, 1);
- break;
- case 32:
- convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h, 2);
- break;
- case 64:
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h);
- break;
- default:
- aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
index 066308315..08bf1ab30 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
@@ -12,7 +12,8 @@
#include <assert.h>
#include <stdio.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
index dc51ab1cb..2a8f75938 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -12,7 +12,8 @@
#include <assert.h>
#include <stdio.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
index 3367be01a..ac87936da 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
@@ -12,7 +12,8 @@
#include <assert.h>
#include <stdio.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
deleted file mode 100644
index 3574da19f..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
+++ /dev/null
@@ -1,646 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t w,
- int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
-
- vector1b = ((const int32_t *)filter_y)[0];
- vector2b = ((const int32_t *)filter_y)[1];
- vector3b = ((const int32_t *)filter_y)[2];
- vector4b = ((const int32_t *)filter_y)[3];
-
- src -= 3 * src_stride;
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < w; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
-
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac0, 31 \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
- "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
- "extp %[Temp2], $ac3, 31 \n\t"
- "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
- "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
- [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
-
- vector1b = ((const int32_t *)filter_y)[0];
- vector2b = ((const int32_t *)filter_y)[1];
- vector3b = ((const int32_t *)filter_y)[2];
- vector4b = ((const int32_t *)filter_y)[3];
-
- src -= 3 * src_stride;
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
- prefetch_store(dst + dst_stride + 32);
-
- for (x = 0; x < 64; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
-
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac0, 31 \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
- "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
- "extp %[Temp2], $ac3, 31 \n\t"
- "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
- "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
- "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
- [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- assert(y_step_q4 == 16); /* precondition, checked only when asserts are enabled */
- assert(((const int32_t *)filter_y)[1] != 0x800000); /* NOTE(review): presumably excludes the pass-through filter (coefficient 3 == 128, coefficient 2 == 0 on little-endian) - confirm */
- /*
-  * Entry point of the vertical averaging convolution.
-  * filter_y is inspected as 32-bit words (two int16_t coefficients each).
-  * Word 0 == 0 means the first two coefficients are both zero; that case is
-  * handed to aom_convolve2_avg_vert_dspr2(). Otherwise the width picks one
-  * of the *_dspr2 kernels below, with aom_convolve8_avg_vert_c() as the
-  * fallback for widths other than 4, 8, 16, 32 and 64.
-  * filter_x and x_step_q4 are only forwarded to those two callees.
-  */
- if (((const int32_t *)filter_y)[0] == 0) {
- aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- uint32_t pos = 38;
-
- /* bit position for extract from acc: wrdsp with mask 1 copies pos (38)
-  * into the DSP control register; NOTE(review): per the MIPS DSP ASE manual
-  * this is the pos field that extp reads - confirm against the kernels */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- case 8:
- case 16:
- case 32:
- convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
- h); /* one kernel serves all four widths; w is passed through */
- break;
- case 64:
- prefetch_store(dst + 32); /* additional prefetch 32 bytes into the first dst row */
- convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
- h); /* width is implied by the kernel, so w is not passed */
- break;
- default:
- aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h); /* generic C path for any other width */
- break;
- }
- }
-}
-
-void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- /*
-  * Two-pass 2-D convolution: aom_convolve8_horiz() filters
-  * intermediate_height rows of width w into temp (row stride 64), then
-  * aom_convolve8_avg_vert() filters temp vertically into dst.
-  * Fixed size intermediate buffer places limits on parameters.
-  * The limits are expressed only as asserts, so nothing enforces them when
-  * asserts are compiled out.
-  */
- DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
- int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; /* h + 7 rows when y_step_q4 == 16 */
-
- assert(w <= 64); /* temp rows are 64 bytes wide */
- assert(h <= 64); /* with step 16 this caps intermediate_height at 71 of the 135 rows */
- assert(x_step_q4 == 16);
- assert(y_step_q4 == 16);
-
- if (intermediate_height < h) intermediate_height = h; /* never fewer rows than the output height */
-
- aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
- x_step_q4, filter_y, y_step_q4, w, intermediate_height); /* starts 3 rows above src */
-
- aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h); /* temp + 64 * 3 skips those 3 leading rows */
-}
-
-void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride, int w,
- int h) {
- int x, y;
- uint32_t tp1, tp2, tn1;
- uint32_t tp3, tp4, tn2;
- /*
-  * Averages a w x h block of src into dst without any filtering: the four
-  * filter arguments are explicitly discarded below. Widths 4, 8, 16, 32 and
-  * 64 use unrolled assembly that loads one 32-bit word (four pixels) from src
-  * and one from dst, combines them with adduh_r.qb (labelled "average" in the
-  * asm) and writes the word back to dst. Every other width takes the scalar
-  * loop in the default branch, dst[x] = (dst[x] + src[x] + 1) >> 1.
-  */
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- /* 1 word storage: each row is a single 32-bit word */
- for (y = h; y--;) { /* runs h times; y is not read inside the body */
- prefetch_load(src + src_stride); /* prefetches target the next row */
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 0(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- /* NOTE(review): dst is read with ulw but written with sw, and the asm
-  * stores through dst without a "memory" clobber (only volatile). The
-  * same pattern repeats in every case below - confirm dst rows are
-  * word-aligned and that the missing clobber is intended. */
- : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- case 8:
- /* 2 word storage: offsets 0 and 4 of each row */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 0(%[dst]) \n\t"
- "ulw %[tp3], 4(%[src]) \n\t"
- "ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- case 16:
- /* 4 word storage: offsets 0..12; tp1/tp2 and tp3/tp4 are reloaded for the
-  * next word pair as soon as their average has been formed */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 0(%[dst]) \n\t"
- "ulw %[tp3], 4(%[src]) \n\t"
- "ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 8(%[src]) \n\t"
- "ulw %[tp2], 8(%[dst]) \n\t"
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
- "ulw %[tp3], 12(%[src]) \n\t"
- "ulw %[tp4], 12(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 8(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 12(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- case 32:
- /* 8 word storage: offsets 0..28, same load/average/store pattern */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 0(%[dst]) \n\t"
- "ulw %[tp3], 4(%[src]) \n\t"
- "ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 8(%[src]) \n\t"
- "ulw %[tp2], 8(%[dst]) \n\t"
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
- "ulw %[tp3], 12(%[src]) \n\t"
- "ulw %[tp4], 12(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 16(%[src]) \n\t"
- "ulw %[tp2], 16(%[dst]) \n\t"
- "sw %[tn1], 8(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 12(%[dst]) \n\t" /* store */
- "ulw %[tp3], 20(%[src]) \n\t"
- "ulw %[tp4], 20(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 24(%[src]) \n\t"
- "ulw %[tp2], 24(%[dst]) \n\t"
- "sw %[tn1], 16(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 20(%[dst]) \n\t" /* store */
- "ulw %[tp3], 28(%[src]) \n\t"
- "ulw %[tp4], 28(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 24(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 28(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- case 64:
- prefetch_load(src + 64); /* the wide case prefetches further into the first rows */
- prefetch_store(dst + 32);
-
- /* 16 word storage: offsets 0..60, same load/average/store pattern */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_load(src + src_stride + 64);
- prefetch_store(dst + dst_stride);
- prefetch_store(dst + dst_stride + 32);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 0(%[dst]) \n\t"
- "ulw %[tp3], 4(%[src]) \n\t"
- "ulw %[tp4], 4(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 8(%[src]) \n\t"
- "ulw %[tp2], 8(%[dst]) \n\t"
- "sw %[tn1], 0(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 4(%[dst]) \n\t" /* store */
- "ulw %[tp3], 12(%[src]) \n\t"
- "ulw %[tp4], 12(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 16(%[src]) \n\t"
- "ulw %[tp2], 16(%[dst]) \n\t"
- "sw %[tn1], 8(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 12(%[dst]) \n\t" /* store */
- "ulw %[tp3], 20(%[src]) \n\t"
- "ulw %[tp4], 20(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 24(%[src]) \n\t"
- "ulw %[tp2], 24(%[dst]) \n\t"
- "sw %[tn1], 16(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 20(%[dst]) \n\t" /* store */
- "ulw %[tp3], 28(%[src]) \n\t"
- "ulw %[tp4], 28(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 32(%[src]) \n\t"
- "ulw %[tp2], 32(%[dst]) \n\t"
- "sw %[tn1], 24(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 28(%[dst]) \n\t" /* store */
- "ulw %[tp3], 36(%[src]) \n\t"
- "ulw %[tp4], 36(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 40(%[src]) \n\t"
- "ulw %[tp2], 40(%[dst]) \n\t"
- "sw %[tn1], 32(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 36(%[dst]) \n\t" /* store */
- "ulw %[tp3], 44(%[src]) \n\t"
- "ulw %[tp4], 44(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 48(%[src]) \n\t"
- "ulw %[tp2], 48(%[dst]) \n\t"
- "sw %[tn1], 40(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 44(%[dst]) \n\t" /* store */
- "ulw %[tp3], 52(%[src]) \n\t"
- "ulw %[tp4], 52(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "ulw %[tp1], 56(%[src]) \n\t"
- "ulw %[tp2], 56(%[dst]) \n\t"
- "sw %[tn1], 48(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 52(%[dst]) \n\t" /* store */
- "ulw %[tp3], 60(%[src]) \n\t"
- "ulw %[tp4], 60(%[dst]) \n\t"
- "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
- "sw %[tn1], 56(%[dst]) \n\t" /* store */
- "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
- "sw %[tn2], 60(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- default: /* any other width: plain C, one pixel at a time */
- for (y = h; y > 0; --y) {
- for (x = 0; x < w; ++x) {
- dst[x] = (dst[x] + src[x] + 1) >> 1; /* mean of the two pixels, halves rounded up */
- }
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
deleted file mode 100644
index f6534b420..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ /dev/null
@@ -1,998 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3, Temp4;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3, p4;
- uint32_t n1, n2, n3, n4;
- uint32_t tn1, tn2;
- /*
-  * Horizontal filter for a block four pixels wide, averaged into dst.
-  * filter_x0 is read as four 32-bit words, i.e. eight int16_t coefficients
-  * packed two per word. For each of the h rows the asm loads src[0..11],
-  * forms four sums (labelled even 1/2 and odd 1/2), each built from four
-  * dpa.w.ph steps on an accumulator preloaded with 64, maps every extracted
-  * sum through the cm table (aom_ff_cropTbl; the asm calls this "clamp") and
-  * stores the addqh_r.w average of that value and the byte already in dst,
-  * at offsets 0 and 2 (even) and 1 and 3 (odd).
-  * NOTE(review): extp depends on a bit position that this function never
-  * sets; presumably the caller programs it with wrdsp first - confirm.
-  * NOTE(review): the asm stores through dst without a "memory" clobber.
-  */
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t" /* src[0..3] */
- "ulw %[tp2], 4(%[src]) \n\t" /* src[4..7] */
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t" /* accumulator starts at 64 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "ulw %[tn2], 8(%[src]) \n\t" /* src[8..11] */
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn2] \n\t"
- "balign %[tn1], %[tn2], 3 \n\t" /* NOTE(review): the three balign ops appear to re-pack the loaded words one byte further along src for the odd pixels - confirm */
- "balign %[tn2], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
- "preceu.ph.qbr %[n1], %[tp2] \n\t"
- "preceu.ph.qbl %[n2], %[tp2] \n\t"
- "preceu.ph.qbr %[n3], %[tn2] \n\t"
- "preceu.ph.qbl %[n4], %[tn2] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[n1], %[tn1] \n\t"
- "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
- "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
- "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
- "extp %[Temp4], $ac2, 31 \n\t"
-
- "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
- "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
-
- /* clamp */
- "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
- "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
- "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
-
- "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
- "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
-
- "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
- "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
- /* every temporary is an early-clobber output; src, dst, cm and the packed coefficients are read-only inputs */
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
- [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3, p4, n1;
- uint32_t tn1, tn2, tn3;
- uint32_t st0, st1;
- /*
-  * Horizontal filter for a block eight pixels wide, averaged into dst.
-  * filter_x0 is read as four 32-bit words, i.e. eight int16_t coefficients
-  * packed two per word. For each of the h rows the asm loads src[0..15] and
-  * produces eight sums: the four "even" ones end up at dst offsets 0, 2, 4, 6
-  * and the four "odd" ones at 1, 3, 5, 7. Each sum is four dpa.w.ph steps on
-  * an accumulator preloaded with 64; the extracted value is mapped through
-  * the cm table (aom_ff_cropTbl; the asm calls this "clamp") and combined
-  * with the existing dst byte by addqh_r.w before being stored.
-  * The work for neighbouring pixels is interleaved and registers are reused,
-  * so the section labels mark where a pixel's accumulation happens, not
-  * where its result is stored.
-  * NOTE(review): extp depends on a bit position that this function never
-  * sets; presumably the caller programs it with wrdsp first - confirm.
-  * NOTE(review): the asm stores through dst without a "memory" clobber.
-  */
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t" /* src[0..3] */
- "ulw %[tp2], 4(%[src]) \n\t" /* src[4..7] */
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t" /* accumulators start at 64 */
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tn2], 8(%[src]) \n\t" /* src[8..11] */
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
- "lbu %[Temp2], 0(%[dst]) \n\t" /* current dst[0] */
- "lbu %[tn3], 2(%[dst]) \n\t" /* current dst[2] */
-
- /* even 2. pixel */
- "preceu.ph.qbr %[p1], %[tn2] \n\t"
- "preceu.ph.qbl %[n1], %[tn2] \n\t"
- "ulw %[tn1], 12(%[src]) \n\t" /* src[12..15] */
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[st0], %[Temp1](%[cm]) \n\t" /* table lookup for even 1 */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[tn1] \n\t"
- "lbux %[st1], %[Temp3](%[cm]) \n\t" /* table lookup for even 2 */
- "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
- "addqh_r.w %[tn3], %[tn3], %[st1] \n\t"
- "sb %[Temp2], 0(%[dst]) \n\t" /* store even 1 */
- "sb %[tn3], 2(%[dst]) \n\t" /* store even 2 */
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- /* NOTE(review): the four balign ops appear to re-pack the loaded words one byte further along src for the odd pixels - confirm */
- "balign %[tn3], %[tn1], 3 \n\t"
- "balign %[tn1], %[tn2], 3 \n\t"
- "balign %[tn2], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
-
- "lbux %[st0], %[Temp1](%[cm]) \n\t" /* table lookup for even 3 */
- "lbu %[Temp2], 4(%[dst]) \n\t"
- "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sb %[Temp2], 4(%[dst]) \n\t" /* store even 3 */
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tn2] \n\t"
- "preceu.ph.qbl %[p4], %[tn2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "lbu %[tp1], 6(%[dst]) \n\t" /* current dst[6], paired with even 4 */
-
- /* odd 2. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn1] \n\t"
- "preceu.ph.qbl %[n1], %[tn1] \n\t"
- "lbux %[st0], %[Temp3](%[cm]) \n\t" /* table lookup for even 4 */
- "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- "lbu %[tp2], 1(%[dst]) \n\t" /* current dst[1] */
- "lbu %[tn2], 3(%[dst]) \n\t" /* current dst[3] */
- "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
-
- /* odd 3. pixel */
- "lbux %[st1], %[Temp2](%[cm]) \n\t" /* table lookup for odd 1 */
- "preceu.ph.qbr %[p2], %[tn3] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
- "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "lbu %[tn3], 5(%[dst]) \n\t" /* current dst[5] */
-
- /* odd 4. pixel */
- "sb %[tp2], 1(%[dst]) \n\t" /* store odd 1 */
- "sb %[tp1], 6(%[dst]) \n\t" /* store even 4 */
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbu %[tn1], 7(%[dst]) \n\t" /* current dst[7] */
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t" /* odd 2 */
- "addqh_r.w %[tn2], %[tn2], %[p4] \n\t"
-
- "lbux %[p2], %[Temp2](%[cm]) \n\t" /* odd 3 */
- "addqh_r.w %[tn3], %[tn3], %[p2] \n\t"
-
- "lbux %[n1], %[Temp1](%[cm]) \n\t" /* odd 4 */
- "addqh_r.w %[tn1], %[tn1], %[n1] \n\t"
-
- /* store bytes */
- "sb %[tn2], 3(%[dst]) \n\t"
- "sb %[tn3], 5(%[dst]) \n\t"
- "sb %[tn1], 7(%[dst]) \n\t"
- /* every temporary is an early-clobber output; src, dst, cm and the packed coefficients are read-only inputs */
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
- [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h,
- int32_t count) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
-
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_store(dst_ptr + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
- "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
- "ulw %[qload2], 16(%[src]) \n\t"
- "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
- "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
- "ulw %[qload3], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
- "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
- "ulw %[qload3], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
- "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
-
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
-
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
-
- "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
- "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
- "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
- [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
- [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
-
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
- prefetch_store(dst_ptr + dst_stride);
- prefetch_store(dst_ptr + dst_stride + 32);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
- "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
- "ulw %[qload2], 16(%[src]) \n\t"
- "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
- "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
- "ulw %[qload3], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
- "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
- "ulw %[qload3], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
- "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
-
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
-
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
- "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
-
- "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
- "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
- "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
- [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
- [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- assert(x_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
-
- if (((const int32_t *)filter_x)[0] == 0) {
- aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- uint32_t pos = 38;
-
- src -= 3;
-
- /* bit positon for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h);
- break;
- case 8:
- convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h);
- break;
- case 16:
- convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h, 1);
- break;
- case 32:
- convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h, 2);
- break;
- case 64:
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
- h);
- break;
- default:
- aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
- h);
- break;
- }
- }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
index dd4bc821a..af54b4264 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
@@ -12,1389 +12,14 @@
#include <assert.h>
#include <stdio.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
#if HAVE_DSPR2
-static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint8_t *dst_ptr;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3, Temp4;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3, p4;
- uint32_t tn1, tn2;
-
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- dst_ptr = dst;
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "ulw %[tn2], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn2] \n\t"
- "balign %[tn1], %[tn2], 3 \n\t"
- "balign %[tn2], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tn2] \n\t"
- "preceu.ph.qbl %[p4], %[tn2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn1] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp4], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[tn1], %[Temp2](%[cm]) \n\t"
- "lbux %[p2], %[Temp4](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[tn1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[tp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[p2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
- [dst_stride] "r"(dst_stride));
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
-}
-
-static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
- int32_t src_stride, uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint8_t *dst_ptr;
- uint32_t vector4a = 64;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2, tp3;
- uint32_t p1, p2, p3, p4, n1;
- uint8_t *odd_dst;
- uint32_t dst_pitch_2 = (dst_stride << 1);
-
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
-
- dst_ptr = dst;
- odd_dst = (dst_ptr + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp2], 0(%[src]) \n\t"
- "ulw %[tp1], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tp1] \n\t"
- "preceu.ph.qbl %[p4], %[tp1] \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "preceu.ph.qbr %[p1], %[tp3] \n\t"
- "preceu.ph.qbl %[n1], %[tp3] \n\t"
- "ulw %[tp2], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[tp2] \n\t"
- "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
- "lbux %[tp3], %[Temp3](%[cm]) \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
- "extp %[p3], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[Temp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "sb %[tp3], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
-
- "ulw %[tp1], 1(%[src]) \n\t"
- "ulw %[tp3], 5(%[src]) \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[tp2], %[p3](%[cm]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp3] \n\t"
- "preceu.ph.qbl %[p4], %[tp3] \n\t"
- "sb %[tp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "ulw %[tp2], 9(%[src]) \n\t"
-
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp1], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[n1], %[tp2] \n\t"
- "ulw %[Temp1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[tp3], %[Temp2](%[cm]) \n\t"
- "preceu.ph.qbr %[p2], %[Temp1] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[tp3], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[n1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[p2], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[n1], 0(%[odd_dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
- [dst_pitch_2] "r"(dst_pitch_2));
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
-}
-
-static void convolve_horiz_16_transposed_dspr2(
- const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
-
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
-
- src = src_ptr;
- dst = dst_ptr;
-
- odd_dst = (dst + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) "
- "\n\t"
- "ulw %[qload2], 4(%[src]) "
- "\n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 1 */
- "mthi $zero, $ac1 "
- "\n\t"
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 2 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "ulw %[qload2], 8(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] "
- "\n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 3 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "ulw %[qload1], 12(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] "
- "\n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 4 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- " \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter12] "
- "\n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] "
- "\n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] "
- "\n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] "
- "\n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 5 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 16(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] "
- "\n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] "
- "\n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] "
- "\n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] "
- "\n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 6 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p4], %[qload2] "
- "\n\t"
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter12] "
- "\n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] "
- "\n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] "
- "\n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] "
- "\n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 7 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p1], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 20(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] "
- "\n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] "
- "\n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] "
- "\n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] "
- "\n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 8 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] "
- "\n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] "
- "\n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] "
- "\n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] "
- "\n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 1 */
- "mthi $zero, $ac3 "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] "
- "\n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] "
- "\n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter56] "
- "\n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] "
- "\n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) "
- "\n\t"
- "ulw %[qload2], 5(%[src]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 2 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 9(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] "
- "\n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] "
- "\n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] "
- "\n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] "
- "\n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 3 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] "
- "\n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] "
- "\n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] "
- "\n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] "
- "\n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 4 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] "
- "\n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] "
- "\n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] "
- "\n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] "
- "\n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 5 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 17(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] "
- "\n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] "
- "\n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] "
- "\n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] "
- "\n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 6 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p4], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] "
- "\n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] "
- "\n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] "
- "\n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] "
- "\n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 7 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbl %[p1], %[qload2] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 21(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] "
- "\n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] "
- "\n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] "
- "\n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] "
- "\n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 8 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p2], %[filter12] "
- "\n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] "
- "\n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] "
- "\n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] "
- "\n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] "
- "\n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] "
- "\n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] "
- "\n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] "
- "\n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
- [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
- [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
- [dst_pitch_2] "r"(dst_pitch_2));
-
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
-
- dst_ptr += 1;
- }
-}
-/*
- * 8-tap horizontal filter for 64-pixel-wide rows, written with MIPS DSP
- * inline assembly; the output is stored transposed.
- *
- * src_ptr, src_stride: first source row and the distance between rows. The
- *   taps are applied starting at src_ptr[0]; each 16-pixel group issues
- *   unaligned word loads covering bytes 0..24 relative to its start.
- * dst_ptr, dst_stride: output pixel x of the current row is stored at
- *   dst_ptr[x * dst_stride]; dst_ptr advances by one byte per source row.
- * filter_x0: eight int16 taps, read here as four int32 words (tap pairs).
- * h: number of source rows to process.
- *
- * Per group, the eight even-indexed outputs are computed first (loads at
- * offsets 0..20, stored through dst), then the eight odd-indexed outputs
- * (loads at offsets 1..21, stored through odd_dst). Three accumulators
- * ($ac1..$ac3) are rotated so that the dot product of one pixel overlaps the
- * table lookup and store of the previous ones; the trailing comment on each
- * instruction names the pixel it belongs to.
- *
- * Each accumulator is seeded with vector_64 (64) before the four dpa.w.ph
- * steps - presumably the rounding term for the filter shift; confirm against
- * FILTER_BITS. extp uses the extract position held in the DSP control
- * register, which aom_convolve8_dspr2 sets with wrdsp before calling. The
- * extracted value indexes aom_ff_cropTbl via lbux - presumably a
- * clamp-to-byte table; confirm at its definition.
- *
- * NOTE(review): the asm stores through dst/odd_dst but declares no "memory"
- * clobber and no accumulator clobbers - confirm this is intentional.
- */
-static void convolve_horiz_64_transposed_dspr2(
- const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
- /* Read the taps two at a time so each word feeds one dpa.w.ph step. */
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
-
- src = src_ptr;
- dst = dst_ptr;
- /* Odd-indexed outputs start one dst_stride after the even ones. */
- odd_dst = (dst + dst_stride);
- /* Four groups of 16 output pixels cover the 64-pixel row. */
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) "
- "\n\t"
- "ulw %[qload2], 4(%[src]) "
- "\n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 1 */
- "mthi $zero, $ac1 "
- "\n\t"
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 2 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "ulw %[qload2], 8(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] "
- "\n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] "
- "\n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 3 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "ulw %[qload1], 12(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] "
- "\n\t" /* even 2 */
- "dpa.w.ph $ac2, %[p3], %[filter34] "
- "\n\t" /* even 2 */
- "dpa.w.ph $ac2, %[p4], %[filter56] "
- "\n\t" /* even 2 */
- "dpa.w.ph $ac2, %[p1], %[filter78] "
- "\n\t" /* even 2 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 2 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 4 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- " \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter12] "
- "\n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] "
- "\n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] "
- "\n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] "
- "\n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 2 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 5 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 16(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] "
- "\n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] "
- "\n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] "
- "\n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] "
- "\n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 6 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p4], %[qload2] "
- "\n\t"
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter12] "
- "\n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] "
- "\n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] "
- "\n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] "
- "\n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 7 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p1], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 20(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] "
- "\n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] "
- "\n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] "
- "\n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] "
- "\n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 8 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] "
- "\n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] "
- "\n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] "
- "\n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] "
- "\n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 1 */
- "mthi $zero, $ac3 "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] "
- "\n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] "
- "\n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter56] "
- "\n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] "
- "\n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) "
- "\n\t"
- "ulw %[qload2], 5(%[src]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 2 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 9(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] "
- "\n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] "
- "\n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] "
- "\n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] "
- "\n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 3 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] "
- "\n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] "
- "\n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] "
- "\n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] "
- "\n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 4 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] "
- "\n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] "
- "\n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] "
- "\n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] "
- "\n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 5 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 17(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] "
- "\n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] "
- "\n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] "
- "\n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] "
- "\n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 6 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p4], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] "
- "\n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] "
- "\n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] "
- "\n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] "
- "\n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 7 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbl %[p1], %[qload2] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 21(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] "
- "\n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] "
- "\n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] "
- "\n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] "
- "\n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 8 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p2], %[filter12] "
- "\n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] "
- "\n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] "
- "\n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] "
- "\n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] "
- "\n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] "
- "\n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] "
- "\n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] "
- "\n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
- [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
- [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
- [dst_pitch_2] "r"(dst_pitch_2));
- /* Next group: 16 source pixels on; dst is recomputed from dst_ptr. */
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
- /* Transposed output: the next source row starts one byte further on. */
- dst_ptr += 1;
- }
-}
-/*
- * Plain-C 8-tap horizontal filter that stores its output transposed.
- *
- * For every row y < h and column x < w the function sums
- * src[x + k] * filter[k] for k = 0..7, rounds the sum with
- * ROUND_POWER_OF_TWO(sum, FILTER_BITS), passes it through clip_pixel() and
- * stores the result at dst[x * dst_stride]. After each row src advances by
- * src_stride and dst by one byte, so source row y becomes destination
- * column y.
- *
- * The taps start at src[x], so src must already point at the first tap
- * position; each row reads src[0 .. w + 6].
- */
-void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter, int w, int h) {
- int x, y, k;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- int sum = 0;
- /* Dot product of the eight taps with the eight pixels from src[x]. */
- for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
- /* Column x of the source row lands in row x of the destination. */
- dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- }
-
- src += src_stride;
- dst += 1;
- }
-}
-/*
- * Copies a w x h block while transposing it.
- *
- * Pixel x of source row y (src[y * src_stride + x]) is written to
- * dst[x * dst_stride + y]: src advances by src_stride and dst by one byte
- * after each row. No filtering is applied.
- */
-void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
- int x, y;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- dst[x * dst_stride] = src[x];
- }
- /* Next source row becomes the next destination column. */
- src += src_stride;
- dst += 1;
- }
-}
-/*
- * 2-D 8-tap convolution built from two transposing horizontal passes.
- *
- * Pass 1 filters the source with filter_x and writes the result transposed
- * into the local buffer temp, using intermediate_height as temp's stride.
- * It starts three rows above src and produces intermediate_height rows.
- * Pass 2 runs the same family of routines over temp with filter_y and
- * transposes again into dst, so the second "horizontal" pass acts along the
- * original vertical direction.
- *
- * In each pass the routine is chosen from the filter:
- *   - tap [3] == 0x80: transposing copy, no filtering;
- *   - first int32 word of the filter == 0: aom_convolve2_dspr2;
- *   - otherwise: the 8-tap DSPr2 routine specialised for the block size
- *     (4, 8, 16/32, 64), or the plain-C fallback for other sizes.
- *
- * x_step_q4 and y_step_q4 are asserted to be 16; no other step is handled.
- * The asserts on word [1] of each filter reject the value 0x800000.
- */
-void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
- int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
- uint32_t pos = 38;
- /* x_step_q4 is otherwise only read by the assert below. */
- (void)x_step_q4;
-
- assert(x_step_q4 == 16);
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
- /* With y_step_q4 == 16 the value is h + 7, so this clamp does not fire. */
- if (intermediate_height < h) intermediate_height = h;
-
- /* Pass 1: src -> temp (transposed), selected by filter_x. */
- if (filter_x[3] == 0x80) {
- copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
- intermediate_height, w, intermediate_height);
- } else if (((const int32_t *)filter_x)[0] == 0) {
- aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
- intermediate_height, filter_x, w, intermediate_height);
- } else {
- src -= (src_stride * 3 + 3);
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
-
- switch (w) {
- case 4:
- convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
- intermediate_height, filter_x,
- intermediate_height);
- break;
- case 8:
- convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
- intermediate_height, filter_x,
- intermediate_height);
- break;
- case 16:
- case 32:
- convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
- intermediate_height, filter_x,
- intermediate_height, (w / 16));
- break;
- case 64:
- prefetch_load(src + 32); /* same address as the prefetch before the switch */
- convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
- intermediate_height, filter_x,
- intermediate_height);
- break;
- default:
- convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
- filter_x, w, intermediate_height);
- break;
- }
- }
-
- /* Pass 2: temp -> dst (transposed back), selected by filter_y. The copy and
-  * 2-tap paths start at temp + 3; the 8-tap paths start at temp. */
- if (filter_y[3] == 0x80) {
- copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
- } else if (((const int32_t *)filter_y)[0] == 0) {
- aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
- filter_y, h, w);
- } else {
- switch (h) {
- case 4:
- convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
- dst_stride, filter_y, w);
- break;
- case 8:
- convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
- dst_stride, filter_y, w);
- break;
- case 16:
- case 32:
- convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
- dst_stride, filter_y, w, (h / 16));
- break;
- case 64:
- convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
- dst_stride, filter_y, w);
- break;
- default:
- convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
- filter_y, h, w);
- break;
- }
- }
-}
-
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
index c60557617..f9c6879ab 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
@@ -12,7 +12,8 @@
#include <assert.h>
#include <stdio.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
index d8a90b6ab..201e66427 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
@@ -12,7 +12,8 @@
#include <assert.h>
#include <stdio.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
index f8fd9e2b6..e7b8d531b 100644
--- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
@@ -14,7 +14,8 @@
#include <assert.h>
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/mips/common_dspr2.h"
@@ -29,18 +30,6 @@ void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4, int w,
int h);
-void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
-
void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter, int w,
int h);
diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
deleted file mode 100644
index 43dce8ba6..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
+++ /dev/null
@@ -1,928 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-/*
- * First stage of the 8-column x 32-row forward DCT column pass.
- *
- * Loads 32 rows of int16 vectors from input (row pitch src_stride) in groups
- * of four, shifts every element left by 2, and runs BUTTERFLY_8 on a group
- * from the top half with the matching group from the bottom half
- * (rows 0-3 with 28-31, 4-7 with 24-27, 8-11 with 20-23, 12-15 with 16-19).
- * The first four BUTTERFLY_8 outputs are stored to temp_buff at the top
- * group's row index and the last four at the bottom group's row index;
- * temp_buff uses a fixed pitch of 8 int16 per row.
- *
- * NOTE(review): presumably BUTTERFLY_8 yields sums in its first four outputs
- * and differences in its last four - confirm in the macro definition.
- */
-static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
- int32_t src_stride,
- int16_t *temp_buff) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 step0, step1, step2, step3;
- v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
- v8i16 step0_1, step1_1, step2_1, step3_1;
-
- /* 1st and 2nd set */
- LD_SH4(input, src_stride, in0, in1, in2, in3);
- LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
- LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
- LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
- SLLI_4V(in0, in1, in2, in3, 2);
- SLLI_4V(in4, in5, in6, in7, 2);
- SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
- SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
- step3, in4, in5, in6, in7);
- BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
- step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
- ST_SH4(step0, step1, step2, step3, temp_buff, 8);
- ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
- ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
- ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
-
- /* 3rd and 4th set */
- LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
- LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
- LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
- LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
- SLLI_4V(in0, in1, in2, in3, 2);
- SLLI_4V(in4, in5, in6, in7, 2);
- SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
- SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
- step3, in4, in5, in6, in7);
- BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
- step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
- ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
- ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
- ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
- ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); /* (15 * 8) + 8 == 16 * 8: row 16 */
-}
-/*
- * Even half of the 32-point column transform for one 8-column strip.
- *
- * Reads sixteen v8i16 vectors from input at element offsets 0..120 (pitch
- * 8) and writes sixteen result vectors to temp at element offsets k * 64,
- * k = 0..15. Each result pair goes through FDCT32_POSTPROC_2V_POS_H before
- * being stored. input is only read here.
- *
- * NOTE(review): the store offsets are multiples of 64, which presumably
- * places the even-indexed coefficient rows of a buffer with 32 elements per
- * row - confirm against the caller's layout of temp.
- */
-static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8i16 temp0, temp1;
-
- /* fdct even */
- LD_SH4(input, 8, in0, in1, in2, in3);
- LD_SH4(input + 96, 8, in12, in13, in14, in15);
- BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
- vec3, in12, in13, in14, in15);
- LD_SH4(input + 32, 8, in4, in5, in6, in7);
- LD_SH4(input + 64, 8, in8, in9, in10, in11);
- BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
- in8, in9, in10, in11);
-
- /* Stage 3 */
- ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
- BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
- DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp);
- ST_SH(temp1, temp + 512);
-
- DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp + 256);
- ST_SH(temp1, temp + 768);
-
- SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
- DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
- ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp + 128);
- ST_SH(temp1, temp + 896);
-
- SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
- DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp + 640);
- ST_SH(temp1, temp + 384);
- /* From here on the second outputs of the two BUTTERFLY_8 calls (in8..in15) are used. */
- DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
- DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
- ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
- DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
- ADD2(in0, in1, in2, in3, vec0, vec7);
- DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp + 64);
- ST_SH(temp1, temp + 960);
-
- SUB2(in0, in1, in2, in3, in0, in2);
- DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp + 576);
- ST_SH(temp1, temp + 448);
-
- SUB2(in9, vec2, in14, vec5, vec2, vec5);
- DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
- SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
- DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp + 320);
- ST_SH(temp1, temp + 704);
-
- ADD2(in3, in2, in0, in1, vec3, vec4);
- DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
- FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
- ST_SH(temp0, temp + 192);
- ST_SH(temp1, temp + 832);
-}
-/*
- * Odd half of the 32-point column transform for one 8-column strip.
- *
- * Reads sixteen v8i16 vectors from input at element offsets 0..120 (the
- * variables are named in16..in31 after the rows they hold) and writes
- * sixteen result vectors to temp_ptr at element offsets k * 64, k = 0..15,
- * each pair passing through FDCT32_POSTPROC_2V_POS_H first.
- *
- * input is also used as scratch: intermediate differences are stored back to
- * offsets 16, 24, 32, 40, 80, 88, 96 and 104 and reloaded further down, so
- * the caller's buffer is modified.
- */
-static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
- v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
- v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
-
- in20 = LD_SH(input + 32);
- in21 = LD_SH(input + 40);
- in26 = LD_SH(input + 80);
- in27 = LD_SH(input + 88);
-
- DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
- DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
- in18 = LD_SH(input + 16);
- in19 = LD_SH(input + 24);
- in28 = LD_SH(input + 96);
- in29 = LD_SH(input + 104);
- /* Park the differences in input; they are reloaded by the LD_SH4 calls near the end. */
- vec4 = in19 - in20;
- ST_SH(vec4, input + 32);
- vec4 = in18 - in21;
- ST_SH(vec4, input + 40);
- vec4 = in29 - in26;
- ST_SH(vec4, input + 80);
- vec4 = in28 - in27;
- ST_SH(vec4, input + 88);
-
- in21 = in18 + in21;
- in20 = in19 + in20;
- in27 = in28 + in27;
- in26 = in29 + in26;
-
- LD_SH4(input + 48, 8, in22, in23, in24, in25);
- DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
- DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-
- in16 = LD_SH(input);
- in17 = LD_SH(input + 8);
- in30 = LD_SH(input + 112);
- in31 = LD_SH(input + 120);
- /* Offsets 16, 24, 96 and 104 were already read above, so they can be overwritten. */
- vec4 = in17 - in22;
- ST_SH(vec4, input + 16);
- vec4 = in16 - in23;
- ST_SH(vec4, input + 24);
- vec4 = in31 - in24;
- ST_SH(vec4, input + 96);
- vec4 = in30 - in25;
- ST_SH(vec4, input + 104);
-
- ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
- DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
- DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
- ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
- DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
- ADD2(in27, in26, in25, in24, in23, in20);
- DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec5, temp_ptr);
- ST_SH(vec4, temp_ptr + 960);
-
- SUB2(in27, in26, in25, in24, in22, in21);
- DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec5, temp_ptr + 448);
- ST_SH(vec4, temp_ptr + 512);
-
- SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
- DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
- SUB2(in26, in27, in24, in25, in23, in20);
- DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec4, temp_ptr + 704);
- ST_SH(vec5, temp_ptr + 256);
-
- ADD2(in26, in27, in24, in25, in22, in21);
- DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec4, temp_ptr + 192);
- ST_SH(vec5, temp_ptr + 768);
- /* Reload the eight difference vectors parked in input earlier. */
- LD_SH4(input + 16, 8, in22, in23, in20, in21);
- LD_SH4(input + 80, 8, in26, in27, in24, in25);
- in16 = in20;
- in17 = in21;
- DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
- DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
- SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
- DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
- ADD2(in28, in29, in31, in30, in16, in19);
- DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec5, temp_ptr + 832);
- ST_SH(vec4, temp_ptr + 128);
-
- SUB2(in28, in29, in31, in30, in17, in18);
- DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec5, temp_ptr + 320);
- ST_SH(vec4, temp_ptr + 640);
- ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
- DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
- SUB2(in29, in28, in30, in31, in16, in19);
- DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec5, temp_ptr + 576);
- ST_SH(vec4, temp_ptr + 384);
-
- ADD2(in29, in28, in30, in31, in17, in18);
- DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
- FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
- ST_SH(vec5, temp_ptr + 64);
- ST_SH(vec4, temp_ptr + 896);
-}
-
-static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
- int16_t *tmp_buf, int16_t *tmp_buf_big) {
- fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
- fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
- fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
-}
-
-static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
- int16_t *output) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
- v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
-
- LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
- LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
- in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
- step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
- ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
- ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
-
- /* 2nd set */
- LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
- LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
- in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
- step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
- ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
- (output + 8 * 8), 8);
- ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
-}
-
-static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
- int16_t *out) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
- v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
- v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
-
- /* fdct32 even */
- /* stage 2 */
- LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
- LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
- in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
- vec7, in8, in9, in10, in11, in12, in13, in14, in15);
- ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
- ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
-
- /* Stage 3 */
- UNPCK_SH_SW(vec0, vec0_l, vec0_r);
- UNPCK_SH_SW(vec1, vec1_l, vec1_r);
- UNPCK_SH_SW(vec2, vec2_l, vec2_r);
- UNPCK_SH_SW(vec3, vec3_l, vec3_r);
- UNPCK_SH_SW(vec4, vec4_l, vec4_r);
- UNPCK_SH_SW(vec5, vec5_l, vec5_r);
- UNPCK_SH_SW(vec6, vec6_l, vec6_r);
- UNPCK_SH_SW(vec7, vec7_l, vec7_r);
- ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
- tmp1_w, tmp2_w, tmp3_w);
- BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
- ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
- vec1_r, vec2_r, vec3_r);
-
- tmp3_w = vec0_r + vec3_r;
- vec0_r = vec0_r - vec3_r;
- vec3_r = vec1_r + vec2_r;
- vec1_r = vec1_r - vec2_r;
-
- DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
- vec4_r, tmp3_w, vec6_r, vec3_r);
- FDCT32_POSTPROC_NEG_W(vec4_r);
- FDCT32_POSTPROC_NEG_W(tmp3_w);
- FDCT32_POSTPROC_NEG_W(vec6_r);
- FDCT32_POSTPROC_NEG_W(vec3_r);
- PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
- ST_SH2(vec5, vec4, out, 8);
-
- DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
- vec4_r, tmp3_w, vec6_r, vec3_r);
- FDCT32_POSTPROC_NEG_W(vec4_r);
- FDCT32_POSTPROC_NEG_W(tmp3_w);
- FDCT32_POSTPROC_NEG_W(vec6_r);
- FDCT32_POSTPROC_NEG_W(vec3_r);
- PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
- ST_SH2(vec5, vec4, out + 16, 8);
-
- LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
- SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
- DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
- ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
- FDCT_POSTPROC_2V_NEG_H(in4, in5);
- ST_SH(in4, out + 32);
- ST_SH(in5, out + 56);
-
- SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
- DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
- FDCT_POSTPROC_2V_NEG_H(in4, in5);
- ST_SH(in4, out + 40);
- ST_SH(in5, out + 48);
-
- LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
- DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
- DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
- ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
- DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
- ADD2(in0, in1, in2, in3, vec0, vec7);
- DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
- FDCT_POSTPROC_2V_NEG_H(in4, in5);
- ST_SH(in4, out + 64);
- ST_SH(in5, out + 120);
-
- SUB2(in0, in1, in2, in3, in0, in2);
- DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
- FDCT_POSTPROC_2V_NEG_H(in4, in5);
- ST_SH(in4, out + 72);
- ST_SH(in5, out + 112);
-
- SUB2(in9, vec2, in14, vec5, vec2, vec5);
- DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
- SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
- DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
- FDCT_POSTPROC_2V_NEG_H(in4, in5);
- ST_SH(in4, out + 80);
- ST_SH(in5, out + 104);
-
- ADD2(in3, in2, in0, in1, vec3, vec4);
- DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
- FDCT_POSTPROC_2V_NEG_H(in4, in5);
- ST_SH(in4, out + 96);
- ST_SH(in5, out + 88);
-}
-
-static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
-
- /* fdct32 even */
- /* stage 2 */
- LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
- LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
- in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
- vec7, in8, in9, in10, in11, in12, in13, in14, in15);
-
- /* Stage 3 */
- ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
- BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
- DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out);
- ST_SH(temp1, out + 8);
-
- DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out + 16);
- ST_SH(temp1, out + 24);
-
- SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
- DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
- ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out + 32);
- ST_SH(temp1, out + 56);
-
- SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
- DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out + 40);
- ST_SH(temp1, out + 48);
-
- DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
- DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
- ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
- DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
- ADD2(in0, in1, in2, in3, vec0, vec7);
- DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out + 64);
- ST_SH(temp1, out + 120);
-
- SUB2(in0, in1, in2, in3, in0, in2);
- DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out + 72);
- ST_SH(temp1, out + 112);
-
- SUB2(in9, vec2, in14, vec5, vec2, vec5);
- DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
- SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5)
- DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out + 80);
- ST_SH(temp1, out + 104);
-
- ADD2(in3, in2, in0, in1, vec3, vec4);
- DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
- FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
- ST_SH(temp0, out + 96);
- ST_SH(temp1, out + 88);
-}
-
-static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
- int16_t *out) {
- v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
- v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
-
- in20 = LD_SH(temp + 32);
- in21 = LD_SH(temp + 40);
- in26 = LD_SH(temp + 80);
- in27 = LD_SH(temp + 88);
-
- DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
- DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
- in18 = LD_SH(temp + 16);
- in19 = LD_SH(temp + 24);
- in28 = LD_SH(temp + 96);
- in29 = LD_SH(temp + 104);
-
- vec4 = in19 - in20;
- ST_SH(vec4, interm_ptr + 32);
- vec4 = in18 - in21;
- ST_SH(vec4, interm_ptr + 88);
- vec4 = in28 - in27;
- ST_SH(vec4, interm_ptr + 56);
- vec4 = in29 - in26;
- ST_SH(vec4, interm_ptr + 64);
-
- ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
-
- in22 = LD_SH(temp + 48);
- in23 = LD_SH(temp + 56);
- in24 = LD_SH(temp + 64);
- in25 = LD_SH(temp + 72);
-
- DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
- DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-
- in16 = LD_SH(temp);
- in17 = LD_SH(temp + 8);
- in30 = LD_SH(temp + 112);
- in31 = LD_SH(temp + 120);
-
- vec4 = in17 - in22;
- ST_SH(vec4, interm_ptr + 40);
- vec4 = in30 - in25;
- ST_SH(vec4, interm_ptr + 48);
- vec4 = in31 - in24;
- ST_SH(vec4, interm_ptr + 72);
- vec4 = in16 - in23;
- ST_SH(vec4, interm_ptr + 80);
-
- ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
- DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
- DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
-
- ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
- DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
- ADD2(in27, in26, in25, in24, in23, in20);
-
- DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec5, out);
- ST_SH(vec4, out + 120);
-
- SUB2(in27, in26, in25, in24, in22, in21);
-
- DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec5, out + 112);
- ST_SH(vec4, out + 8);
-
- SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
- DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
- SUB2(in26, in27, in24, in25, in23, in20);
-
- DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec4, out + 16);
- ST_SH(vec5, out + 104);
-
- ADD2(in26, in27, in24, in25, in22, in21);
- DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec4, out + 24);
- ST_SH(vec5, out + 96);
-
- in20 = LD_SH(interm_ptr + 32);
- in21 = LD_SH(interm_ptr + 88);
- in27 = LD_SH(interm_ptr + 56);
- in26 = LD_SH(interm_ptr + 64);
-
- in16 = in20;
- in17 = in21;
- DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
- DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-
- in22 = LD_SH(interm_ptr + 40);
- in25 = LD_SH(interm_ptr + 48);
- in24 = LD_SH(interm_ptr + 72);
- in23 = LD_SH(interm_ptr + 80);
-
- SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
- DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
- ADD2(in28, in29, in31, in30, in16, in19);
- DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec5, out + 32);
- ST_SH(vec4, out + 88);
-
- SUB2(in28, in29, in31, in30, in17, in18);
- DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec5, out + 40);
- ST_SH(vec4, out + 80);
-
- ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
- DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
- SUB2(in29, in28, in30, in31, in16, in19);
-
- DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec5, out + 72);
- ST_SH(vec4, out + 48);
-
- ADD2(in29, in28, in30, in31, in17, in18);
-
- DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
- FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
- ST_SH(vec4, out + 56);
- ST_SH(vec5, out + 64);
-}
-
-static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
-
- /* 1st set */
- in0 = LD_SH(temp);
- in4 = LD_SH(temp + 32);
- in2 = LD_SH(temp + 64);
- in6 = LD_SH(temp + 96);
- in1 = LD_SH(temp + 128);
- in7 = LD_SH(temp + 152);
- in3 = LD_SH(temp + 192);
- in5 = LD_SH(temp + 216);
-
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- /* 2nd set */
- in0_1 = LD_SH(temp + 16);
- in1_1 = LD_SH(temp + 232);
- in2_1 = LD_SH(temp + 80);
- in3_1 = LD_SH(temp + 168);
- in4_1 = LD_SH(temp + 48);
- in5_1 = LD_SH(temp + 176);
- in6_1 = LD_SH(temp + 112);
- in7_1 = LD_SH(temp + 240);
-
- ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
- TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
- in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
-
- /* 3rd set */
- in0 = LD_SH(temp + 8);
- in1 = LD_SH(temp + 136);
- in2 = LD_SH(temp + 72);
- in3 = LD_SH(temp + 200);
- in4 = LD_SH(temp + 40);
- in5 = LD_SH(temp + 208);
- in6 = LD_SH(temp + 104);
- in7 = LD_SH(temp + 144);
-
- ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
- 32);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
-
- /* 4th set */
- in0_1 = LD_SH(temp + 24);
- in1_1 = LD_SH(temp + 224);
- in2_1 = LD_SH(temp + 88);
- in3_1 = LD_SH(temp + 160);
- in4_1 = LD_SH(temp + 56);
- in5_1 = LD_SH(temp + 184);
- in6_1 = LD_SH(temp + 120);
- in7_1 = LD_SH(temp + 248);
-
- TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
- in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
- ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
- 32);
-}
-
-static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
- fdct8x32_1d_row_load_butterfly(temp, temp_buf);
- fdct8x32_1d_row_even(temp_buf, temp_buf);
- fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
- fdct8x32_1d_row_transpose_store(temp_buf, output);
-}
-
-static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
- int16_t *output) {
- fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
- fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
- fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
- fdct8x32_1d_row_transpose_store(tmp_buf, output);
-}
-
-void aom_fdct32x32_msa(const int16_t *input, int16_t *output,
- int32_t src_stride) {
- int32_t i;
- DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
- DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
-
- /* column transform */
- for (i = 0; i < 4; ++i) {
- fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
- tmp_buf_big + (8 * i));
- }
-
- /* row transform */
- fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
-
- /* row transform */
- for (i = 1; i < 4; ++i) {
- fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
- }
-}
-
-static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
-
- /* fdct32 even */
- /* stage 2 */
- LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
- LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
- in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
- vec7, in8, in9, in10, in11, in12, in13, in14, in15);
- FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
- FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
- FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
- FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
- FDCT_POSTPROC_2V_NEG_H(in8, in9);
- FDCT_POSTPROC_2V_NEG_H(in10, in11);
- FDCT_POSTPROC_2V_NEG_H(in12, in13);
- FDCT_POSTPROC_2V_NEG_H(in14, in15);
-
- /* Stage 3 */
- ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
-
- temp0 = in0 + in3;
- in0 = in0 - in3;
- in3 = in1 + in2;
- in1 = in1 - in2;
-
- DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
- ST_SH(temp0, out);
- ST_SH(temp1, out + 8);
-
- DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
- ST_SH(temp0, out + 16);
- ST_SH(temp1, out + 24);
-
- SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
- DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
- ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
- ST_SH(temp0, out + 32);
- ST_SH(temp1, out + 56);
-
- SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
- DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
- ST_SH(temp0, out + 40);
- ST_SH(temp1, out + 48);
-
- DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
- DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
- ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
- DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
- ADD2(in0, in1, in2, in3, vec0, vec7);
- DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
- ST_SH(temp0, out + 64);
- ST_SH(temp1, out + 120);
-
- SUB2(in0, in1, in2, in3, in0, in2);
- DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
- ST_SH(temp0, out + 72);
- ST_SH(temp1, out + 112);
-
- SUB2(in9, vec2, in14, vec5, vec2, vec5);
- DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
- SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
- DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
- ST_SH(temp0, out + 80);
- ST_SH(temp1, out + 104);
-
- ADD2(in3, in2, in0, in1, vec3, vec4);
- DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
- ST_SH(temp0, out + 96);
- ST_SH(temp1, out + 88);
-}
-
-static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
- int16_t *out) {
- v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
- v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
- v8i16 vec4, vec5;
-
- in20 = LD_SH(temp + 32);
- in21 = LD_SH(temp + 40);
- in26 = LD_SH(temp + 80);
- in27 = LD_SH(temp + 88);
-
- DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
- DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
- FDCT_POSTPROC_2V_NEG_H(in20, in21);
- FDCT_POSTPROC_2V_NEG_H(in26, in27);
-
- in18 = LD_SH(temp + 16);
- in19 = LD_SH(temp + 24);
- in28 = LD_SH(temp + 96);
- in29 = LD_SH(temp + 104);
-
- FDCT_POSTPROC_2V_NEG_H(in18, in19);
- FDCT_POSTPROC_2V_NEG_H(in28, in29);
-
- vec4 = in19 - in20;
- ST_SH(vec4, interm_ptr + 32);
- vec4 = in18 - in21;
- ST_SH(vec4, interm_ptr + 88);
- vec4 = in29 - in26;
- ST_SH(vec4, interm_ptr + 64);
- vec4 = in28 - in27;
- ST_SH(vec4, interm_ptr + 56);
-
- ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
-
- in22 = LD_SH(temp + 48);
- in23 = LD_SH(temp + 56);
- in24 = LD_SH(temp + 64);
- in25 = LD_SH(temp + 72);
-
- DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
- DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
- FDCT_POSTPROC_2V_NEG_H(in22, in23);
- FDCT_POSTPROC_2V_NEG_H(in24, in25);
-
- in16 = LD_SH(temp);
- in17 = LD_SH(temp + 8);
- in30 = LD_SH(temp + 112);
- in31 = LD_SH(temp + 120);
-
- FDCT_POSTPROC_2V_NEG_H(in16, in17);
- FDCT_POSTPROC_2V_NEG_H(in30, in31);
-
- vec4 = in17 - in22;
- ST_SH(vec4, interm_ptr + 40);
- vec4 = in30 - in25;
- ST_SH(vec4, interm_ptr + 48);
- vec4 = in31 - in24;
- ST_SH(vec4, interm_ptr + 72);
- vec4 = in16 - in23;
- ST_SH(vec4, interm_ptr + 80);
-
- ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
- DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
- DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
- ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
- DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
- ADD2(in27, in26, in25, in24, in23, in20);
- DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
- ST_SH(vec5, out);
- ST_SH(vec4, out + 120);
-
- SUB2(in27, in26, in25, in24, in22, in21);
- DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
- ST_SH(vec5, out + 112);
- ST_SH(vec4, out + 8);
-
- SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
- DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
- SUB2(in26, in27, in24, in25, in23, in20);
- DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
- ST_SH(vec4, out + 16);
- ST_SH(vec5, out + 104);
-
- ADD2(in26, in27, in24, in25, in22, in21);
- DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
- ST_SH(vec4, out + 24);
- ST_SH(vec5, out + 96);
-
- in20 = LD_SH(interm_ptr + 32);
- in21 = LD_SH(interm_ptr + 88);
- in27 = LD_SH(interm_ptr + 56);
- in26 = LD_SH(interm_ptr + 64);
-
- in16 = in20;
- in17 = in21;
- DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
- DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-
- in22 = LD_SH(interm_ptr + 40);
- in25 = LD_SH(interm_ptr + 48);
- in24 = LD_SH(interm_ptr + 72);
- in23 = LD_SH(interm_ptr + 80);
-
- SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
- DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
- in16 = in28 + in29;
- in19 = in31 + in30;
- DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
- ST_SH(vec5, out + 32);
- ST_SH(vec4, out + 88);
-
- SUB2(in28, in29, in31, in30, in17, in18);
- DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
- ST_SH(vec5, out + 40);
- ST_SH(vec4, out + 80);
-
- ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
- DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
- SUB2(in29, in28, in30, in31, in16, in19);
- DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
- ST_SH(vec5, out + 72);
- ST_SH(vec4, out + 48);
-
- ADD2(in29, in28, in30, in31, in17, in18);
- DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
- ST_SH(vec4, out + 56);
- ST_SH(vec5, out + 64);
-}
-
-static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
- int16_t *output) {
- fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
- fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
- fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
- fdct8x32_1d_row_transpose_store(tmp_buf, output);
-}
-
-void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
- int32_t src_stride) {
- int32_t i;
- DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
- DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
-
- /* column transform */
- for (i = 0; i < 4; ++i) {
- fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
- &tmp_buf_big[0] + (8 * i));
- }
-
- /* row transform */
- for (i = 0; i < 4; ++i) {
- fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
- out + (8 * i * 32));
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
deleted file mode 100644
index 7a285b7b8..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-
-void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
- int32_t src_stride) {
- v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
- v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
- v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
- v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
- v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
- cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
- v8i16 coeff2 = {
- -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
- };
-
- LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
- in10, in11, in12, in13, in14, in15);
- SLLI_4V(in0, in1, in2, in3, 2);
- SLLI_4V(in4, in5, in6, in7, 2);
- SLLI_4V(in8, in9, in10, in11, 2);
- SLLI_4V(in12, in13, in14, in15, 2);
- ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
- ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
- FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
- tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
- ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
- SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
- SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
-
- tmp_ptr += 16;
-
- /* stp 1 */
- ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
- ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
-
- cnst4 = __msa_splati_h(coeff, 0);
- stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
-
- cnst5 = __msa_splati_h(coeff, 1);
- cnst5 = __msa_ilvev_h(cnst5, cnst4);
- stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
- stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
- stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
-
- /* stp2 */
- BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
- BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
- ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
- ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
- SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
- cnst0 = __msa_ilvev_h(cnst0, cnst1);
- stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
-
- cnst0 = __msa_splati_h(coeff, 4);
- cnst1 = __msa_ilvev_h(cnst1, cnst0);
- stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
-
- BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
- ILVRL_H2_SH(in15, in8, vec1, vec0);
- SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
- cnst0 = __msa_ilvev_h(cnst0, cnst1);
-
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
- ST_SH(in8, tmp_ptr);
-
- cnst0 = __msa_splati_h(coeff2, 0);
- cnst0 = __msa_ilvev_h(cnst1, cnst0);
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
- ST_SH(in8, tmp_ptr + 224);
-
- ILVRL_H2_SH(in14, in9, vec1, vec0);
- SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
- cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
- ST_SH(in8, tmp_ptr + 128);
-
- cnst1 = __msa_splati_h(coeff2, 2);
- cnst0 = __msa_ilvev_h(cnst0, cnst1);
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
- ST_SH(in8, tmp_ptr + 96);
-
- SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
- cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
- stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
-
- cnst1 = __msa_splati_h(coeff, 3);
- cnst1 = __msa_ilvev_h(cnst0, cnst1);
- stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
-
- /* stp4 */
- ADD2(stp34, stp25, stp33, stp22, in13, in10);
-
- ILVRL_H2_SH(in13, in10, vec1, vec0);
- SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
- cnst0 = __msa_ilvev_h(cnst0, cnst1);
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
- ST_SH(in8, tmp_ptr + 64);
-
- cnst0 = __msa_splati_h(coeff2, 1);
- cnst0 = __msa_ilvev_h(cnst1, cnst0);
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
- ST_SH(in8, tmp_ptr + 160);
-
- SUB2(stp34, stp25, stp33, stp22, in12, in11);
- ILVRL_H2_SH(in12, in11, vec1, vec0);
- SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
- cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
- ST_SH(in8, tmp_ptr + 192);
-
- cnst1 = __msa_splati_h(coeff2, 3);
- cnst0 = __msa_ilvev_h(cnst0, cnst1);
- in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
- ST_SH(in8, tmp_ptr + 32);
-}
-
-void fdct16x8_1d_row(int16_t *input, int16_t *output) {
- v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-
- LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
- LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
- ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
- ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
- ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
- SRA_4V(in0, in1, in2, in3, 2);
- SRA_4V(in4, in5, in6, in7, 2);
- SRA_4V(in8, in9, in10, in11, 2);
- SRA_4V(in12, in13, in14, in15, 2);
- BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
- in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
- tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
- ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
- FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
- tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
- LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
- FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
- tmp1, in1, tmp2, in2, tmp3, in3);
- ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
- TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
- tmp5, in5, tmp6, in6, tmp7, in7);
- ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
-}
-
-void aom_fdct4x4_msa(const int16_t *input, int16_t *output,
- int32_t src_stride) {
- v8i16 in0, in1, in2, in3;
-
- LD_SH4(input, src_stride, in0, in1, in2, in3);
-
- /* fdct4 pre-process */
- {
- v8i16 vec, mask;
- v16i8 zero = { 0 };
- v16i8 one = __msa_ldi_b(1);
-
- mask = (v8i16)__msa_sldi_b(zero, one, 15);
- SLLI_4V(in0, in1, in2, in3, 4);
- vec = __msa_ceqi_h(in0, 0);
- vec = vec ^ 255;
- vec = mask & vec;
- in0 += vec;
- }
-
- AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
- TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
- AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
- TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
- ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
- SRA_4V(in0, in1, in2, in3, 2);
- PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
- ST_SH2(in0, in2, output, 8);
-}
-
-void aom_fdct8x8_msa(const int16_t *input, int16_t *output,
- int32_t src_stride) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
- LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
- SLLI_4V(in0, in1, in2, in3, 2);
- SLLI_4V(in4, in5, in6, in7, 2);
- AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
- in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
- in5, in6, in7);
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
- ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
-}
-
-void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- out[0] = LD_HADD(input, stride);
- out[1] = 0;
-}
-
-void aom_fdct16x16_msa(const int16_t *input, int16_t *output,
- int32_t src_stride) {
- int32_t i;
- DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
-
- /* column transform */
- for (i = 0; i < 2; ++i) {
- fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
- }
-
- /* row transform */
- for (i = 0; i < 2; ++i) {
- fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
deleted file mode 100644
index ada25dffd..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_
-#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_
-
-#include "aom_dsp/mips/txfm_macros_msa.h"
-#include "aom_dsp/txfm_common.h"
-
-#define LD_HADD(psrc, stride) \
- ({ \
- v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
- v4i32 vec_w_m; \
- \
- LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
- ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
- LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
- ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
- in0_m, in4_m); \
- in0_m += in4_m; \
- \
- vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
- HADD_SW_S32(vec_w_m); \
- })
-
-#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
- v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 coeff_m = { \
- cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
- }; \
- \
- BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
- ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
- SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \
- \
- SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \
- cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \
- vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
- \
- vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \
- cnst2_m = __msa_splati_h(coeff_m, 2); \
- cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \
- vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
- \
- SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
- PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \
- vec7_m, out0, out2, out1, out3); \
- }
-
-#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
- { \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- \
- SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \
- SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \
- AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
- in2, in3); \
- AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
- in6, in7); \
- }
-
-#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
- out3, out4, out5, out6, out7) \
- { \
- v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
- v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \
- v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
- cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
- \
- /* FDCT stage1 */ \
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
- s3_m, s4_m, s5_m, s6_m, s7_m); \
- BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
- ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
- ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
- SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x1_m, x0_m); \
- out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
- \
- SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
- x2_m = -x2_m; \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
- \
- out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- x2_m = __msa_splati_h(coeff_m, 2); \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
- \
- /* stage2 */ \
- ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
- \
- s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
- \
- /* stage3 */ \
- BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
- \
- /* stage4 */ \
- ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
- ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
- \
- SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x0_m, x1_m); \
- out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
- \
- SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
- \
- x1_m = __msa_splati_h(coeff_m, 5); \
- x0_m = -x0_m; \
- x0_m = __msa_ilvev_h(x1_m, x0_m); \
- out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
- \
- x2_m = __msa_splati_h(coeff_m, 6); \
- x3_m = -x3_m; \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
- }
-
-#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
- v8i16 x0_m, x1_m, x2_m, x3_m; \
- v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
- cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
- \
- /* FDCT stage1 */ \
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
- s3_m, s4_m, s5_m, s6_m, s7_m); \
- BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
- ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
- ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
- SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x1_m, x0_m); \
- out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
- \
- SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
- x2_m = -x2_m; \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
- \
- out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- x2_m = __msa_splati_h(coeff_m, 2); \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
- \
- /* stage2 */ \
- ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
- \
- s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
- s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
- \
- /* stage3 */ \
- BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
- \
- /* stage4 */ \
- ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
- ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
- \
- SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
- x1_m = __msa_ilvev_h(x0_m, x1_m); \
- out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
- \
- SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
- x2_m = __msa_ilvev_h(x3_m, x2_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
- \
- x1_m = __msa_splati_h(coeff_m, 5); \
- x0_m = -x0_m; \
- x0_m = __msa_ilvev_h(x1_m, x0_m); \
- out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
- \
- x2_m = __msa_splati_h(coeff_m, 6); \
- x3_m = -x3_m; \
- x2_m = __msa_ilvev_h(x2_m, x3_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
- }
-
-#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
- input7, out1, out3, out5, out7, out9, out11, out13, \
- out15) \
- { \
- v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
- v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
- v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \
- v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
- v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
- v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
- -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \
- v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \
- cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \
- v8i16 coeff2_m = { \
- -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \
- }; \
- \
- /* stp 1 */ \
- ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \
- ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \
- \
- cnst4_m = __msa_splati_h(coeff_m, 0); \
- stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \
- \
- cnst5_m = __msa_splati_h(coeff_m, 1); \
- cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \
- stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \
- stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \
- stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \
- \
- /* stp2 */ \
- BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \
- stp33_m); \
- BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \
- stp34_m); \
- \
- ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \
- ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \
- \
- SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
- \
- cnst0_m = __msa_splati_h(coeff_m, 4); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
- \
- SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
- \
- cnst0_m = __msa_splati_h(coeff_m, 3); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
- \
- /* stp4 */ \
- BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \
- vec5_m); \
- BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
- stp31_m); \
- \
- ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- \
- out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- cnst0_m = __msa_splati_h(coeff2_m, 0); \
- cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- \
- out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
- \
- cnst1_m = __msa_splati_h(coeff2_m, 2); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- cnst0_m = __msa_splati_h(coeff2_m, 1); \
- cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- \
- ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \
- SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- \
- out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
- \
- cnst1_m = __msa_splati_h(coeff2_m, 3); \
- cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- }
-
-#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
- { \
- v8i16 tp0_m, tp1_m; \
- v8i16 one_m = __msa_ldi_h(1); \
- \
- tp0_m = __msa_clti_s_h(vec0, 0); \
- tp1_m = __msa_clti_s_h(vec1, 0); \
- vec0 += 1; \
- vec1 += 1; \
- tp0_m = one_m & tp0_m; \
- tp1_m = one_m & tp1_m; \
- vec0 += tp0_m; \
- vec1 += tp1_m; \
- vec0 >>= 2; \
- vec1 >>= 2; \
- }
-
-#define FDCT32_POSTPROC_NEG_W(vec) \
- { \
- v4i32 temp_m; \
- v4i32 one_m = __msa_ldi_w(1); \
- \
- temp_m = __msa_clti_s_w(vec, 0); \
- vec += 1; \
- temp_m = one_m & temp_m; \
- vec += temp_m; \
- vec >>= 2; \
- }
-
-#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
- { \
- v8i16 tp0_m, tp1_m; \
- v8i16 one = __msa_ldi_h(1); \
- \
- tp0_m = __msa_clei_s_h(vec0, 0); \
- tp1_m = __msa_clei_s_h(vec1, 0); \
- tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
- tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
- vec0 += 1; \
- vec1 += 1; \
- tp0_m = one & tp0_m; \
- tp1_m = one & tp1_m; \
- vec0 += tp0_m; \
- vec1 += tp1_m; \
- vec0 >>= 2; \
- vec1 >>= 2; \
- }
-
-#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
- const0, const1, out0, out1, out2, out3) \
- { \
- v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
- v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \
- v4i32 k0_m = __msa_fill_w((int32_t)const0); \
- \
- s0_m = __msa_fill_w((int32_t)const1); \
- k0_m = __msa_ilvev_w(s0_m, k0_m); \
- \
- ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \
- ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \
- ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \
- ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \
- \
- DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \
- DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \
- tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
- tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
- tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
- tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
- out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
- out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
- \
- DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \
- DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \
- tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
- tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
- tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
- tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
- out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
- out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
- }
-
-void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
- int32_t src_stride);
-void fdct16x8_1d_row(int16_t *input, int16_t *output);
-#endif // AOM_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c
deleted file mode 100644
index 0ea127f52..000000000
--- a/third_party/aom/aom_dsp/mips/idct16x16_msa.c
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
- v8i16 loc0, loc1, loc2, loc3;
- v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
- v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
- v8i16 tmp5, tmp6, tmp7;
-
- LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- input += 8;
- LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
- TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
- reg2, reg3, reg4, reg5, reg6, reg7);
- TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
- reg9, reg10, reg11, reg12, reg13, reg14, reg15);
- DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
- DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
- BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
- DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
- DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
- DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
- BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
- SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
- reg8);
- ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
- reg10);
-
- /* stage 2 */
- DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
- DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
-
- reg9 = reg1 - loc2;
- reg1 = reg1 + loc2;
- reg7 = reg15 - loc3;
- reg15 = reg15 + loc3;
-
- DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
- DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
- BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
-
- loc1 = reg15 + reg3;
- reg3 = reg15 - reg3;
- loc2 = reg2 + loc1;
- reg15 = reg2 - loc1;
-
- loc1 = reg1 + reg13;
- reg13 = reg1 - reg13;
- loc0 = reg0 + loc1;
- loc1 = reg0 - loc1;
- tmp6 = loc0;
- tmp7 = loc1;
- reg0 = loc2;
-
- DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
- DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
-
- loc0 = reg9 + reg5;
- reg5 = reg9 - reg5;
- reg2 = reg6 + loc0;
- reg1 = reg6 - loc0;
-
- loc0 = reg7 + reg11;
- reg11 = reg7 - reg11;
- loc1 = reg4 + loc0;
- loc2 = reg4 - loc0;
- tmp5 = loc1;
-
- DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
- BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
-
- reg10 = loc0;
- reg11 = loc1;
-
- DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
- BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
-
- reg13 = loc2;
-
- /* Transpose and store the output */
- reg12 = tmp5;
- reg14 = tmp6;
- reg3 = tmp7;
-
- /* transpose block */
- TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
- reg2, reg4, reg6, reg8, reg10, reg12, reg14);
- ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
-
- /* transpose block */
- TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
- reg13, reg11, reg5, reg7, reg9, reg1, reg15);
- ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
-}
-
-void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- v8i16 loc0, loc1, loc2, loc3;
- v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
- v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
- v8i16 tmp5, tmp6, tmp7;
-
- /* load up 8x8 */
- LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- input += 8 * 16;
- /* load bottom 8x8 */
- LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
- DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
- DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
- BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
- DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
- DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
- DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
- BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
-
- reg0 = reg2 - loc1;
- reg2 = reg2 + loc1;
- reg12 = reg14 - loc0;
- reg14 = reg14 + loc0;
- reg4 = reg6 - loc3;
- reg6 = reg6 + loc3;
- reg8 = reg10 - loc2;
- reg10 = reg10 + loc2;
-
- /* stage 2 */
- DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
- DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
-
- reg9 = reg1 - loc2;
- reg1 = reg1 + loc2;
- reg7 = reg15 - loc3;
- reg15 = reg15 + loc3;
-
- DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
- DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
- BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
-
- loc1 = reg15 + reg3;
- reg3 = reg15 - reg3;
- loc2 = reg2 + loc1;
- reg15 = reg2 - loc1;
-
- loc1 = reg1 + reg13;
- reg13 = reg1 - reg13;
- loc0 = reg0 + loc1;
- loc1 = reg0 - loc1;
- tmp6 = loc0;
- tmp7 = loc1;
- reg0 = loc2;
-
- DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
- DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
-
- loc0 = reg9 + reg5;
- reg5 = reg9 - reg5;
- reg2 = reg6 + loc0;
- reg1 = reg6 - loc0;
-
- loc0 = reg7 + reg11;
- reg11 = reg7 - reg11;
- loc1 = reg4 + loc0;
- loc2 = reg4 - loc0;
- tmp5 = loc1;
-
- DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
- BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
-
- reg10 = loc0;
- reg11 = loc1;
-
- DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
- BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
- reg13 = loc2;
-
- /* Transpose and store the output */
- reg12 = tmp5;
- reg14 = tmp6;
- reg3 = tmp7;
-
- SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
- dst += (4 * dst_stride);
- SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
- dst += (4 * dst_stride);
- SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
- dst += (4 * dst_stride);
- SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
-}
-
-void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- int32_t i;
- DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
- int16_t *out = out_arr;
-
- /* transform rows */
- for (i = 0; i < 2; ++i) {
- /* process 16 * 8 block */
- aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
- }
-
- /* transform columns */
- for (i = 0; i < 2; ++i) {
- /* process 8 * 16 block */
- aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
- dst_stride);
- }
-}
-
-void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- uint8_t i;
- DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
- int16_t *out = out_arr;
-
- /* process 16 * 8 block */
- aom_idct16_1d_rows_msa(input, out);
-
- /* short case just considers top 4 rows as valid output */
- out += 4 * 16;
- for (i = 12; i--;) {
- __asm__ __volatile__(
- "sw $zero, 0(%[out]) \n\t"
- "sw $zero, 4(%[out]) \n\t"
- "sw $zero, 8(%[out]) \n\t"
- "sw $zero, 12(%[out]) \n\t"
- "sw $zero, 16(%[out]) \n\t"
- "sw $zero, 20(%[out]) \n\t"
- "sw $zero, 24(%[out]) \n\t"
- "sw $zero, 28(%[out]) \n\t"
-
- :
- : [out] "r"(out));
-
- out += 16;
- }
-
- out = out_arr;
-
- /* transform columns */
- for (i = 0; i < 2; ++i) {
- /* process 8 * 16 block */
- aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
- dst_stride);
- }
-}
-
-void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- uint8_t i;
- int16_t out;
- v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
- v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
-
- out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
- out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
- out = ROUND_POWER_OF_TWO(out, 6);
-
- vec = __msa_fill_h(out);
-
- for (i = 4; i--;) {
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- UNPCK_UB_SH(dst0, res0, res4);
- UNPCK_UB_SH(dst1, res1, res5);
- UNPCK_UB_SH(dst2, res2, res6);
- UNPCK_UB_SH(dst3, res3, res7);
- ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
- ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
- CLIP_SH4_0_255(res0, res1, res2, res3);
- CLIP_SH4_0_255(res4, res5, res6, res7);
- PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
- tmp2, tmp3);
- ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
- v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
- v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
-
- /* load input data */
- LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
- l7, l15);
- TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
- l7);
- TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
- l12, l13, l14, l15);
-
- /* ADST in horizontal */
- AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
- l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
- r12, r13, r14, r15);
-
- l1 = -r8;
- l3 = -r4;
- l13 = -r13;
- l15 = -r1;
-
- TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
- l6, l7);
- ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
- TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
- l13, l14, l15);
- ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
-}
-
-void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
- v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
- v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
- v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
- v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
- v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
- v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
- v16i8 zero = { 0 };
-
- r0 = LD_SH(input + 0 * 16);
- r3 = LD_SH(input + 3 * 16);
- r4 = LD_SH(input + 4 * 16);
- r7 = LD_SH(input + 7 * 16);
- r8 = LD_SH(input + 8 * 16);
- r11 = LD_SH(input + 11 * 16);
- r12 = LD_SH(input + 12 * 16);
- r15 = LD_SH(input + 15 * 16);
-
- /* stage 1 */
- k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
- k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
- k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
- MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
- k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
- k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
- k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
- MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
- BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
- k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
- k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
- MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
-
- r1 = LD_SH(input + 1 * 16);
- r2 = LD_SH(input + 2 * 16);
- r5 = LD_SH(input + 5 * 16);
- r6 = LD_SH(input + 6 * 16);
- r9 = LD_SH(input + 9 * 16);
- r10 = LD_SH(input + 10 * 16);
- r13 = LD_SH(input + 13 * 16);
- r14 = LD_SH(input + 14 * 16);
-
- k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
- k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
- k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
- MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
- k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
- k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
- k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
- MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
- BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
- BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
- out1 = -out1;
- SRARI_H2_SH(out0, out1, 6);
- dst0 = LD_UB(dst + 0 * dst_stride);
- dst1 = LD_UB(dst + 15 * dst_stride);
- ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
- ADD2(res0, out0, res1, out1, res0, res1);
- CLIP_SH2_0_255(res0, res1);
- PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
- ST8x1_UB(res0, dst);
- ST8x1_UB(res1, dst + 15 * dst_stride);
-
- k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
- k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
- k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
- MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
- BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
- out8 = -out8;
-
- SRARI_H2_SH(out8, out9, 6);
- dst8 = LD_UB(dst + 1 * dst_stride);
- dst9 = LD_UB(dst + 14 * dst_stride);
- ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
- ADD2(res8, out8, res9, out9, res8, res9);
- CLIP_SH2_0_255(res8, res9);
- PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
- ST8x1_UB(res8, dst + dst_stride);
- ST8x1_UB(res9, dst + 14 * dst_stride);
-
- k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
- k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
- MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
- out4 = -out4;
- SRARI_H2_SH(out4, out5, 6);
- dst4 = LD_UB(dst + 3 * dst_stride);
- dst5 = LD_UB(dst + 12 * dst_stride);
- ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
- ADD2(res4, out4, res5, out5, res4, res5);
- CLIP_SH2_0_255(res4, res5);
- PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
- ST8x1_UB(res4, dst + 3 * dst_stride);
- ST8x1_UB(res5, dst + 12 * dst_stride);
-
- MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
- out13 = -out13;
- SRARI_H2_SH(out12, out13, 6);
- dst12 = LD_UB(dst + 2 * dst_stride);
- dst13 = LD_UB(dst + 13 * dst_stride);
- ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
- ADD2(res12, out12, res13, out13, res12, res13);
- CLIP_SH2_0_255(res12, res13);
- PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
- ST8x1_UB(res12, dst + 2 * dst_stride);
- ST8x1_UB(res13, dst + 13 * dst_stride);
-
- k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
- k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
- MADD_SHORT(out6, out7, k0, k3, out6, out7);
- SRARI_H2_SH(out6, out7, 6);
- dst6 = LD_UB(dst + 4 * dst_stride);
- dst7 = LD_UB(dst + 11 * dst_stride);
- ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
- ADD2(res6, out6, res7, out7, res6, res7);
- CLIP_SH2_0_255(res6, res7);
- PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
- ST8x1_UB(res6, dst + 4 * dst_stride);
- ST8x1_UB(res7, dst + 11 * dst_stride);
-
- MADD_SHORT(out10, out11, k0, k3, out10, out11);
- SRARI_H2_SH(out10, out11, 6);
- dst10 = LD_UB(dst + 6 * dst_stride);
- dst11 = LD_UB(dst + 9 * dst_stride);
- ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
- ADD2(res10, out10, res11, out11, res10, res11);
- CLIP_SH2_0_255(res10, res11);
- PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
- ST8x1_UB(res10, dst + 6 * dst_stride);
- ST8x1_UB(res11, dst + 9 * dst_stride);
-
- k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
- k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
- MADD_SHORT(h10, h11, k1, k2, out2, out3);
- SRARI_H2_SH(out2, out3, 6);
- dst2 = LD_UB(dst + 7 * dst_stride);
- dst3 = LD_UB(dst + 8 * dst_stride);
- ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
- ADD2(res2, out2, res3, out3, res2, res3);
- CLIP_SH2_0_255(res2, res3);
- PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
- ST8x1_UB(res2, dst + 7 * dst_stride);
- ST8x1_UB(res3, dst + 8 * dst_stride);
-
- MADD_SHORT(out14, out15, k1, k2, out14, out15);
- SRARI_H2_SH(out14, out15, 6);
- dst14 = LD_UB(dst + 5 * dst_stride);
- dst15 = LD_UB(dst + 10 * dst_stride);
- ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
- ADD2(res14, out14, res15, out15, res14, res15);
- CLIP_SH2_0_255(res14, res15);
- PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
- ST8x1_UB(res14, dst + 5 * dst_stride);
- ST8x1_UB(res15, dst + 10 * dst_stride);
-}
diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c
deleted file mode 100644
index f1ca757a0..000000000
--- a/third_party/aom/aom_dsp/mips/idct32x32_msa.c
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-static void idct32x8_row_transpose_store(const int16_t *input,
- int16_t *tmp_buf) {
- v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
- /* 1st & 2nd 8x8 */
- LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
- LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
- n3);
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
- n7);
- ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
- ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
- ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
-
- /* 3rd & 4th 8x8 */
- LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
- LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
- n3);
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
- n7);
- ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
- ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
- ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
- ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
-}
-
-static void idct32x8_row_even_process_store(int16_t *tmp_buf,
- int16_t *tmp_eve_buf) {
- v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
- v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
- v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
-
- /* Even stage 1 */
- LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
- DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
- DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
- BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
- DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-
- loc1 = vec3;
- loc0 = vec1;
-
- DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
- DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
- BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
- BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
- BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
-
- /* Even stage 2 */
- LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
- DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
- DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
- DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
-
- vec0 = reg0 + reg4;
- reg0 = reg0 - reg4;
- reg4 = reg6 + reg2;
- reg6 = reg6 - reg2;
- reg2 = reg1 + reg5;
- reg1 = reg1 - reg5;
- reg5 = reg7 + reg3;
- reg7 = reg7 - reg3;
- reg3 = vec0;
-
- vec1 = reg2;
- reg2 = reg3 + reg4;
- reg3 = reg3 - reg4;
- reg4 = reg5 - vec1;
- reg5 = reg5 + vec1;
-
- DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
- DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
-
- vec0 = reg0 - reg6;
- reg0 = reg0 + reg6;
- vec1 = reg7 - reg1;
- reg7 = reg7 + reg1;
-
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
- DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
-
- /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
- BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
- ST_SH(loc0, (tmp_eve_buf + 15 * 8));
- ST_SH(loc1, (tmp_eve_buf));
- ST_SH(loc2, (tmp_eve_buf + 14 * 8));
- ST_SH(loc3, (tmp_eve_buf + 8));
-
- BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
- ST_SH(loc0, (tmp_eve_buf + 13 * 8));
- ST_SH(loc1, (tmp_eve_buf + 2 * 8));
- ST_SH(loc2, (tmp_eve_buf + 12 * 8));
- ST_SH(loc3, (tmp_eve_buf + 3 * 8));
-
- /* Store 8 */
- BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
- ST_SH(loc0, (tmp_eve_buf + 11 * 8));
- ST_SH(loc1, (tmp_eve_buf + 4 * 8));
- ST_SH(loc2, (tmp_eve_buf + 10 * 8));
- ST_SH(loc3, (tmp_eve_buf + 5 * 8));
-
- BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
- ST_SH(loc0, (tmp_eve_buf + 9 * 8));
- ST_SH(loc1, (tmp_eve_buf + 6 * 8));
- ST_SH(loc2, (tmp_eve_buf + 8 * 8));
- ST_SH(loc3, (tmp_eve_buf + 7 * 8));
-}
-
-static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
- int16_t *tmp_odd_buf) {
- v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
- v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-
- /* Odd stage 1 */
- reg0 = LD_SH(tmp_buf + 8);
- reg1 = LD_SH(tmp_buf + 7 * 8);
- reg2 = LD_SH(tmp_buf + 9 * 8);
- reg3 = LD_SH(tmp_buf + 15 * 8);
- reg4 = LD_SH(tmp_buf + 17 * 8);
- reg5 = LD_SH(tmp_buf + 23 * 8);
- reg6 = LD_SH(tmp_buf + 25 * 8);
- reg7 = LD_SH(tmp_buf + 31 * 8);
-
- DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
- DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
- DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
- DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
-
- vec0 = reg0 + reg3;
- reg0 = reg0 - reg3;
- reg3 = reg7 + reg4;
- reg7 = reg7 - reg4;
- reg4 = reg1 + reg2;
- reg1 = reg1 - reg2;
- reg2 = reg6 + reg5;
- reg6 = reg6 - reg5;
- reg5 = vec0;
-
- /* 4 Stores */
- ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
-
- SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
- ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
-
- /* 4 Stores */
- DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
- DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
- BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
-
- DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
- ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
-
- /* Odd stage 2 */
- /* 8 loads */
- reg0 = LD_SH(tmp_buf + 3 * 8);
- reg1 = LD_SH(tmp_buf + 5 * 8);
- reg2 = LD_SH(tmp_buf + 11 * 8);
- reg3 = LD_SH(tmp_buf + 13 * 8);
- reg4 = LD_SH(tmp_buf + 19 * 8);
- reg5 = LD_SH(tmp_buf + 21 * 8);
- reg6 = LD_SH(tmp_buf + 27 * 8);
- reg7 = LD_SH(tmp_buf + 29 * 8);
-
- DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
- DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
- DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
- DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
-
- /* 4 Stores */
- SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
- DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
- DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
-
- BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
-
- DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
-
- /* 4 Stores */
- ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
- BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
- ST_SH(reg0, (tmp_odd_buf + 13 * 8));
- ST_SH(reg1, (tmp_odd_buf + 14 * 8));
-
- DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
- ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
-
- /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
-
- /* Load 8 & Store 8 */
- LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
- LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
-
- ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
-
- SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
- SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
-
- /* Load 8 & Store 8 */
- LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
- LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
-
- ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
-
- SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
- SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
-}
-
-static void idct_butterfly_transpose_store(int16_t *tmp_buf,
- int16_t *tmp_eve_buf,
- int16_t *tmp_odd_buf, int16_t *dst) {
- v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
- v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
- /* FINAL BUTTERFLY : Dependency on Even & Odd */
- vec0 = LD_SH(tmp_odd_buf);
- vec1 = LD_SH(tmp_odd_buf + 9 * 8);
- vec2 = LD_SH(tmp_odd_buf + 14 * 8);
- vec3 = LD_SH(tmp_odd_buf + 6 * 8);
- loc0 = LD_SH(tmp_eve_buf);
- loc1 = LD_SH(tmp_eve_buf + 8 * 8);
- loc2 = LD_SH(tmp_eve_buf + 4 * 8);
- loc3 = LD_SH(tmp_eve_buf + 12 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
-
- ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
- ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
- ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
- ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
-
- /* Load 8 & Store 8 */
- vec0 = LD_SH(tmp_odd_buf + 4 * 8);
- vec1 = LD_SH(tmp_odd_buf + 13 * 8);
- vec2 = LD_SH(tmp_odd_buf + 10 * 8);
- vec3 = LD_SH(tmp_odd_buf + 3 * 8);
- loc0 = LD_SH(tmp_eve_buf + 2 * 8);
- loc1 = LD_SH(tmp_eve_buf + 10 * 8);
- loc2 = LD_SH(tmp_eve_buf + 6 * 8);
- loc3 = LD_SH(tmp_eve_buf + 14 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
-
- ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
- ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
- ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
- ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
-
- /* Load 8 & Store 8 */
- vec0 = LD_SH(tmp_odd_buf + 2 * 8);
- vec1 = LD_SH(tmp_odd_buf + 11 * 8);
- vec2 = LD_SH(tmp_odd_buf + 12 * 8);
- vec3 = LD_SH(tmp_odd_buf + 7 * 8);
- loc0 = LD_SH(tmp_eve_buf + 1 * 8);
- loc1 = LD_SH(tmp_eve_buf + 9 * 8);
- loc2 = LD_SH(tmp_eve_buf + 5 * 8);
- loc3 = LD_SH(tmp_eve_buf + 13 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
-
- ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
- ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
- ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
- ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
-
- /* Load 8 & Store 8 */
- vec0 = LD_SH(tmp_odd_buf + 5 * 8);
- vec1 = LD_SH(tmp_odd_buf + 15 * 8);
- vec2 = LD_SH(tmp_odd_buf + 8 * 8);
- vec3 = LD_SH(tmp_odd_buf + 1 * 8);
- loc0 = LD_SH(tmp_eve_buf + 3 * 8);
- loc1 = LD_SH(tmp_eve_buf + 11 * 8);
- loc2 = LD_SH(tmp_eve_buf + 7 * 8);
- loc3 = LD_SH(tmp_eve_buf + 15 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
-
- ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
- ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
- ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
- ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
-
- /* Transpose : 16 vectors */
- /* 1st & 2nd 8x8 */
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
- n3);
- ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
- ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
-
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
- n7);
- ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
- ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
-
- /* 3rd & 4th 8x8 */
- LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
- LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
- TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
- n3);
- ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
- ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
-
- TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
- n7);
- ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
- ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
-}
-
-static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
- DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
- DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
- DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
-
- idct32x8_row_transpose_store(input, &tmp_buf[0]);
- idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
- idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
- idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
- output);
-}
-
-static void idct8x32_column_even_process_store(int16_t *tmp_buf,
- int16_t *tmp_eve_buf) {
- v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
- v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
- v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
-
- /* Even stage 1 */
- LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
- tmp_buf += (2 * 32);
-
- DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
- DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
- BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
- DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-
- loc1 = vec3;
- loc0 = vec1;
-
- DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
- DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
- BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
- BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
- BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
-
- /* Even stage 2 */
- /* Load 8 */
- LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
- DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
- DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
- DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
- DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
-
- vec0 = reg0 + reg4;
- reg0 = reg0 - reg4;
- reg4 = reg6 + reg2;
- reg6 = reg6 - reg2;
- reg2 = reg1 + reg5;
- reg1 = reg1 - reg5;
- reg5 = reg7 + reg3;
- reg7 = reg7 - reg3;
- reg3 = vec0;
-
- vec1 = reg2;
- reg2 = reg3 + reg4;
- reg3 = reg3 - reg4;
- reg4 = reg5 - vec1;
- reg5 = reg5 + vec1;
-
- DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
- DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
-
- vec0 = reg0 - reg6;
- reg0 = reg0 + reg6;
- vec1 = reg7 - reg1;
- reg7 = reg7 + reg1;
-
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
- DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
-
- /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
- /* Store 8 */
- BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
- ST_SH2(loc1, loc3, tmp_eve_buf, 8);
- ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
-
- BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
- ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
- ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
-
- /* Store 8 */
- BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
- ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
- ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
-
- BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
- ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
- ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
-}
-
-static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
- int16_t *tmp_odd_buf) {
- v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
- v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-
- /* Odd stage 1 */
- reg0 = LD_SH(tmp_buf + 32);
- reg1 = LD_SH(tmp_buf + 7 * 32);
- reg2 = LD_SH(tmp_buf + 9 * 32);
- reg3 = LD_SH(tmp_buf + 15 * 32);
- reg4 = LD_SH(tmp_buf + 17 * 32);
- reg5 = LD_SH(tmp_buf + 23 * 32);
- reg6 = LD_SH(tmp_buf + 25 * 32);
- reg7 = LD_SH(tmp_buf + 31 * 32);
-
- DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
- DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
- DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
- DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
-
- vec0 = reg0 + reg3;
- reg0 = reg0 - reg3;
- reg3 = reg7 + reg4;
- reg7 = reg7 - reg4;
- reg4 = reg1 + reg2;
- reg1 = reg1 - reg2;
- reg2 = reg6 + reg5;
- reg6 = reg6 - reg5;
- reg5 = vec0;
-
- /* 4 Stores */
- ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
- SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
- ST_SH2(vec0, vec1, tmp_odd_buf, 8);
-
- /* 4 Stores */
- DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
- DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
- BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
- DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
- ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
-
- /* Odd stage 2 */
- /* 8 loads */
- reg0 = LD_SH(tmp_buf + 3 * 32);
- reg1 = LD_SH(tmp_buf + 5 * 32);
- reg2 = LD_SH(tmp_buf + 11 * 32);
- reg3 = LD_SH(tmp_buf + 13 * 32);
- reg4 = LD_SH(tmp_buf + 19 * 32);
- reg5 = LD_SH(tmp_buf + 21 * 32);
- reg6 = LD_SH(tmp_buf + 27 * 32);
- reg7 = LD_SH(tmp_buf + 29 * 32);
-
- DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
- DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
- DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
- DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
-
- /* 4 Stores */
- SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
- DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
- DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
- BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
- DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
- ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
-
- /* 4 Stores */
- ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
- BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
- ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
- DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
- ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
-
- /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
- /* Load 8 & Store 8 */
- LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
- LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
-
- ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
-
- SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
- SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
-
- /* Load 8 & Store 8 */
- LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
- LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
-
- ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
-
- SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
- SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
- DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
- ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
-}
-
-static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
- int16_t *tmp_odd_buf, uint8_t *dst,
- int32_t dst_stride) {
- v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
- v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
- /* FINAL BUTTERFLY : Dependency on Even & Odd */
- vec0 = LD_SH(tmp_odd_buf);
- vec1 = LD_SH(tmp_odd_buf + 9 * 8);
- vec2 = LD_SH(tmp_odd_buf + 14 * 8);
- vec3 = LD_SH(tmp_odd_buf + 6 * 8);
- loc0 = LD_SH(tmp_eve_buf);
- loc1 = LD_SH(tmp_eve_buf + 8 * 8);
- loc2 = LD_SH(tmp_eve_buf + 4 * 8);
- loc3 = LD_SH(tmp_eve_buf + 12 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
- SRARI_H4_SH(m0, m2, m4, m6, 6);
- AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
-
- SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
- SRARI_H4_SH(m0, m2, m4, m6, 6);
- AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
- m6);
-
- /* Load 8 & Store 8 */
- vec0 = LD_SH(tmp_odd_buf + 4 * 8);
- vec1 = LD_SH(tmp_odd_buf + 13 * 8);
- vec2 = LD_SH(tmp_odd_buf + 10 * 8);
- vec3 = LD_SH(tmp_odd_buf + 3 * 8);
- loc0 = LD_SH(tmp_eve_buf + 2 * 8);
- loc1 = LD_SH(tmp_eve_buf + 10 * 8);
- loc2 = LD_SH(tmp_eve_buf + 6 * 8);
- loc3 = LD_SH(tmp_eve_buf + 14 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
- SRARI_H4_SH(m1, m3, m5, m7, 6);
- AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
-
- SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
- SRARI_H4_SH(m1, m3, m5, m7, 6);
- AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
- m7);
-
- /* Load 8 & Store 8 */
- vec0 = LD_SH(tmp_odd_buf + 2 * 8);
- vec1 = LD_SH(tmp_odd_buf + 11 * 8);
- vec2 = LD_SH(tmp_odd_buf + 12 * 8);
- vec3 = LD_SH(tmp_odd_buf + 7 * 8);
- loc0 = LD_SH(tmp_eve_buf + 1 * 8);
- loc1 = LD_SH(tmp_eve_buf + 9 * 8);
- loc2 = LD_SH(tmp_eve_buf + 5 * 8);
- loc3 = LD_SH(tmp_eve_buf + 13 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
- SRARI_H4_SH(n0, n2, n4, n6, 6);
- AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
-
- SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
- SRARI_H4_SH(n0, n2, n4, n6, 6);
- AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
- n6);
-
- /* Load 8 & Store 8 */
- vec0 = LD_SH(tmp_odd_buf + 5 * 8);
- vec1 = LD_SH(tmp_odd_buf + 15 * 8);
- vec2 = LD_SH(tmp_odd_buf + 8 * 8);
- vec3 = LD_SH(tmp_odd_buf + 1 * 8);
- loc0 = LD_SH(tmp_eve_buf + 3 * 8);
- loc1 = LD_SH(tmp_eve_buf + 11 * 8);
- loc2 = LD_SH(tmp_eve_buf + 7 * 8);
- loc3 = LD_SH(tmp_eve_buf + 15 * 8);
-
- ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
- SRARI_H4_SH(n1, n3, n5, n7, 6);
- AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
-
- SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
- SRARI_H4_SH(n1, n3, n5, n7, 6);
- AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
- n7);
-}
-
-static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
- DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
-
- idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
- idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
- idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
- dst_stride);
-}
-
-void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- int32_t i;
- DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
- int16_t *out_ptr = out_arr;
-
- /* transform rows */
- for (i = 0; i < 4; ++i) {
- /* process 32 * 8 block */
- idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
- }
-
- /* transform columns */
- for (i = 0; i < 4; ++i) {
- /* process 8 * 32 block */
- idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
- dst_stride);
- }
-}
-
-void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- int32_t i;
- DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
- int16_t *out_ptr = out_arr;
-
- for (i = 32; i--;) {
- __asm__ __volatile__(
- "sw $zero, 0(%[out_ptr]) \n\t"
- "sw $zero, 4(%[out_ptr]) \n\t"
- "sw $zero, 8(%[out_ptr]) \n\t"
- "sw $zero, 12(%[out_ptr]) \n\t"
- "sw $zero, 16(%[out_ptr]) \n\t"
- "sw $zero, 20(%[out_ptr]) \n\t"
- "sw $zero, 24(%[out_ptr]) \n\t"
- "sw $zero, 28(%[out_ptr]) \n\t"
- "sw $zero, 32(%[out_ptr]) \n\t"
- "sw $zero, 36(%[out_ptr]) \n\t"
- "sw $zero, 40(%[out_ptr]) \n\t"
- "sw $zero, 44(%[out_ptr]) \n\t"
- "sw $zero, 48(%[out_ptr]) \n\t"
- "sw $zero, 52(%[out_ptr]) \n\t"
- "sw $zero, 56(%[out_ptr]) \n\t"
- "sw $zero, 60(%[out_ptr]) \n\t"
-
- :
- : [out_ptr] "r"(out_ptr));
-
- out_ptr += 32;
- }
-
- out_ptr = out_arr;
-
- /* rows: only upper-left 8x8 has non-zero coeff */
- idct32x8_1d_rows_msa(input, out_ptr);
-
- /* transform columns */
- for (i = 0; i < 4; ++i) {
- /* process 8 * 32 block */
- idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
- dst_stride);
- }
-}
-
-void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- int32_t i;
- int16_t out;
- v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
- v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
-
- out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
- out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
- out = ROUND_POWER_OF_TWO(out, 6);
-
- vec = __msa_fill_h(out);
-
- for (i = 16; i--;) {
- LD_UB2(dst, 16, dst0, dst1);
- LD_UB2(dst + dst_stride, 16, dst2, dst3);
-
- UNPCK_UB_SH(dst0, res0, res4);
- UNPCK_UB_SH(dst1, res1, res5);
- UNPCK_UB_SH(dst2, res2, res6);
- UNPCK_UB_SH(dst3, res3, res7);
- ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
- ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
- CLIP_SH4_0_255(res0, res1, res2, res3);
- CLIP_SH4_0_255(res4, res5, res6, res7);
- PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
- tmp2, tmp3);
-
- ST_UB2(tmp0, tmp1, dst, 16);
- dst += dst_stride;
- ST_UB2(tmp2, tmp3, dst, 16);
- dst += dst_stride;
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c
deleted file mode 100644
index 274818baa..000000000
--- a/third_party/aom/aom_dsp/mips/idct4x4_msa.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- v8i16 in0, in1, in2, in3;
- v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;
-
- /* load vector elements of 4x4 block */
- LD4x4_SH(input, in0, in2, in3, in1);
- TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
- UNPCK_R_SH_SW(in0, in0_r);
- UNPCK_R_SH_SW(in2, in2_r);
- UNPCK_R_SH_SW(in3, in3_r);
- UNPCK_R_SH_SW(in1, in1_r);
- SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);
-
- in0_r += in2_r;
- in3_r -= in1_r;
- in4_r = (in0_r - in3_r) >> 1;
- in1_r = in4_r - in1_r;
- in2_r = in4_r - in2_r;
- in0_r -= in1_r;
- in3_r += in2_r;
-
- TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);
-
- in0_r += in1_r;
- in2_r -= in3_r;
- in4_r = (in0_r - in2_r) >> 1;
- in3_r = in4_r - in3_r;
- in1_r = in4_r - in1_r;
- in0_r -= in3_r;
- in2_r += in1_r;
-
- PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
- in2, in3);
- ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
-}
-
-void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- int16_t a1, e1;
- v8i16 in1, in0 = { 0 };
-
- a1 = input[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
-
- in0 = __msa_insert_h(in0, 0, a1);
- in0 = __msa_insert_h(in0, 1, e1);
- in0 = __msa_insert_h(in0, 2, e1);
- in0 = __msa_insert_h(in0, 3, e1);
-
- in1 = in0 >> 1;
- in0 -= in1;
-
- ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride);
-}
-
-void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- v8i16 in0, in1, in2, in3;
-
- /* load vector elements of 4x4 block */
- LD4x4_SH(input, in0, in1, in2, in3);
- /* rows */
- TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
- AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
- /* columns */
- TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
- AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
- /* rounding (add 2^3, divide by 2^4) */
- SRARI_H4_SH(in0, in1, in2, in3, 4);
- ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
-}
-
-void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- int16_t out;
- v8i16 vec;
-
- out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
- out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
- out = ROUND_POWER_OF_TWO(out, 4);
- vec = __msa_fill_h(out);
-
- ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
-}
diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c
deleted file mode 100644
index 981c103cd..000000000
--- a/third_party/aom/aom_dsp/mips/idct8x8_msa.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
- /* load vector elements of 8x8 block */
- LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-
- /* rows transform */
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- /* 1D idct8x8 */
- AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- /* columns transform */
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- /* 1D idct8x8 */
- AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- /* final rounding (add 2^4, divide by 2^5) and shift */
- SRARI_H4_SH(in0, in1, in2, in3, 5);
- SRARI_H4_SH(in4, in5, in6, in7, 5);
- /* add block and store 8x8 */
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
- dst += (4 * dst_stride);
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
-}
-
-void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
- v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
- v4i32 tmp0, tmp1, tmp2, tmp3;
- v8i16 zero = { 0 };
-
- /* load vector elements of 8x8 block */
- LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
- TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-
- /* stage1 */
- ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
- k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
- k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
- k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
- DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
- SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
- PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
- PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
- BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
-
- /* stage2 */
- ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
- k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
- k1 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
- k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
- k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
- DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
- SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
- PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
- PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
- BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
-
- /* stage3 */
- s0 = __msa_ilvr_h(s6, s5);
-
- k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
- DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
- SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
- PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
-
- /* stage4 */
- BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
- in7);
- TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- /* final rounding (add 2^4, divide by 2^5) and shift */
- SRARI_H4_SH(in0, in1, in2, in3, 5);
- SRARI_H4_SH(in4, in5, in6, in7, 5);
-
- /* add block and store 8x8 */
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
- dst += (4 * dst_stride);
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
-}
-
-void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
- int32_t dst_stride) {
- int16_t out;
- int32_t val;
- v8i16 vec;
-
- out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
- out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
- val = ROUND_POWER_OF_TWO(out, 5);
- vec = __msa_fill_h(val);
-
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
- dst += (4 * dst_stride);
- AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
-}
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
index bcb9c9df9..9f25cc1ca 100644
--- a/third_party/aom/aom_dsp/mips/intrapred_msa.c
+++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c
@@ -9,7 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/macros_msa.h"
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
deleted file mode 100644
index c69835173..000000000
--- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
-#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/mips/common_dspr2.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* Note: this macro expects a local int32_t named out to exist, and will write
- * to that variable. */
-#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \
- ({ \
- \
- int32_t tmp; \
- int dct_cost_rounding = DCT_CONST_ROUNDING; \
- int in = input; \
- \
- __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \
- "mtlo %[dct_cost_rounding], $ac1 " \
- " \n\t" \
- "mthi $zero, $ac1 " \
- " \n\t" \
- "madd $ac1, %[in], " \
- "%[cospi_16_64] \n\t" \
- "extp %[tmp], $ac1, " \
- "31 \n\t" \
- \
- /* out = dct_const_round_shift(out * cospi_16_64); */ \
- "mtlo %[dct_cost_rounding], $ac2 " \
- " \n\t" \
- "mthi $zero, $ac2 " \
- " \n\t" \
- "madd $ac2, %[tmp], " \
- "%[cospi_16_64] \n\t" \
- "extp %[out], $ac2, " \
- "31 \n\t" \
- \
- : [tmp] "=&r"(tmp), [out] "=r"(out) \
- : [in] "r"(in), \
- [dct_cost_rounding] "r"(dct_cost_rounding), \
- [cospi_16_64] "r"(cospi_16_64)); \
- out; \
- })
-
-void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
-void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output);
-void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
-void iadst4_dspr2(const int16_t *input, int16_t *output);
-void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
-void iadst8_dspr2(const int16_t *input, int16_t *output);
-void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
-void iadst16_dspr2(const int16_t *input, int16_t *output);
-
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
deleted file mode 100644
index 122667aa8..000000000
--- a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_
-#define AOM_DSP_MIPS_INV_TXFM_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/mips/txfm_macros_msa.h"
-#include "aom_dsp/txfm_common.h"
-
-#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
- out3, out4, out5, out6, out7) \
- { \
- v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
- v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
- cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
- v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
- cospi_24_64, -cospi_24_64, 0, 0 }; \
- \
- SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
- cnst2_m = -cnst0_m; \
- ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
- SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
- cnst4_m = -cnst2_m; \
- ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
- \
- ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
- ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
- DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
- cnst2_m, cnst3_m, in7, in0, in4, in3); \
- \
- SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
- cnst2_m = -cnst0_m; \
- ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
- SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
- cnst4_m = -cnst2_m; \
- ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
- \
- ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
- ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
- \
- DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
- cnst2_m, cnst3_m, in5, in2, in6, in1); \
- BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
- out7 = -s0_m; \
- out0 = s1_m; \
- \
- SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
- \
- ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
- cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- cnst1_m = cnst0_m; \
- \
- ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
- ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
- DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
- cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
- \
- SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
- cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
- \
- ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
- ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
- out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
- out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
- out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
- out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
- \
- out1 = -out1; \
- out3 = -out3; \
- out5 = -out5; \
- }
-
-#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \
- ({ \
- v8i16 out0_m, r0_m, r1_m; \
- \
- r0_m = __msa_fill_h(c0_h); \
- r1_m = __msa_fill_h(c1_h); \
- out0_m = __msa_ilvev_h(r1_m, r0_m); \
- \
- out0_m; \
- })
-
-#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \
- { \
- uint8_t *dst_m = (uint8_t *)(dst); \
- v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
- v16i8 tmp0_m, tmp1_m; \
- v16i8 zero_m = { 0 }; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- \
- LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
- ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
- res0_m, res1_m, res2_m, res3_m); \
- ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \
- res2_m, res3_m); \
- CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
- PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
- }
-
-#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v8i16 c0_m, c1_m, c2_m, c3_m; \
- v8i16 step0_m, step1_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
- c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
- step0_m = __msa_ilvr_h(in2, in0); \
- DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
- \
- c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
- c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
- step1_m = __msa_ilvr_h(in3, in1); \
- DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- \
- PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
- SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
- BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
- out0, out1, out2, out3); \
- }
-
-#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v8i16 res0_m, res1_m, c0_m, c1_m; \
- v8i16 k1_m, k2_m, k3_m, k4_m; \
- v8i16 zero_m = { 0 }; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v4i32 int0_m, int1_m, int2_m, int3_m; \
- v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \
- -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
- \
- SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
- ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
- ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
- DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
- int0_m = tmp2_m + tmp1_m; \
- \
- SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
- ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
- DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
- int1_m = tmp0_m + tmp1_m; \
- \
- c0_m = __msa_splati_h(mask_m, 6); \
- ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
- ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
- DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
- int2_m = tmp0_m + tmp1_m; \
- \
- c0_m = __msa_splati_h(mask_m, 6); \
- c0_m = __msa_ilvev_h(c0_m, k1_m); \
- \
- res0_m = __msa_ilvr_h((in1), (in3)); \
- tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
- int3_m = tmp2_m + tmp0_m; \
- \
- res0_m = __msa_ilvr_h((in2), (in3)); \
- c1_m = __msa_ilvev_h(k4_m, k3_m); \
- \
- tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
- res1_m = __msa_ilvr_h((in0), (in2)); \
- c1_m = __msa_ilvev_h(k1_m, zero_m); \
- \
- tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
- int3_m += tmp2_m; \
- int3_m += tmp3_m; \
- \
- SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
- PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
- }
-
-#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \
- ({ \
- v8i16 c0_m, c1_m; \
- \
- SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
- c0_m = __msa_ilvev_h(c1_m, c0_m); \
- \
- c0_m; \
- })
-
-/* multiply and add macro */
-#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
- out2, out3) \
- { \
- v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
- v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \
- \
- ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
- ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
- DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
- cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
- SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
- PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \
- DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
- cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
- SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
- PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \
- }
-
-/* idct 8x8 macro */
-#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
- v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
- cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
- \
- k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \
- k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \
- k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \
- k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \
- AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
- SUB2(in1, in3, in7, in5, res0_m, res1_m); \
- k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \
- k1_m = __msa_splati_h(mask_m, 4); \
- \
- ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
- DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
- tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- tp4_m = in1 + in3; \
- PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
- tp7_m = in7 + in5; \
- k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
- k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
- AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
- BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
- BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
- out1, out2, out3, out4, out5, out6, out7); \
- }
-
-#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
- v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
- v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \
- cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
- v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \
- -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
- v8i16 mask3_m = { \
- -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \
- }; \
- \
- k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \
- k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \
- ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
- r1_m, r2_m, r3_m); \
- k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \
- k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \
- ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
- r5_m, r6_m, r7_m); \
- ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
- m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
- SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
- m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
- k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \
- k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \
- ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
- r1_m, r2_m, r3_m); \
- k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \
- k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \
- ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
- r5_m, r6_m, r7_m); \
- ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
- m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
- SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
- m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
- ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
- BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
- k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \
- k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \
- ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
- r1_m, r2_m, r3_m); \
- k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \
- DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \
- r6_m, r7_m); \
- ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
- m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
- SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
- m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
- k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \
- k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \
- ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
- DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \
- m1_m, m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
- ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
- DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \
- m2_m, m3_m); \
- SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
- \
- out1 = -in1; \
- out3 = -in3; \
- out5 = -in5; \
- out7 = -in7; \
- }
-
-#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \
- r12, r13, r14, r15, out0, out1, out2, out3, out4, \
- out5, out6, out7, out8, out9, out10, out11, out12, \
- out13, out14, out15) \
- { \
- v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
- v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
- v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
- v8i16 h8_m, h9_m, h10_m, h11_m; \
- v8i16 k0_m, k1_m, k2_m, k3_m; \
- \
- /* stage 1 */ \
- k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
- k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
- k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
- k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
- MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \
- k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
- k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
- k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
- k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
- MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
- k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
- k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
- k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
- k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
- MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \
- g11_m); \
- k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
- k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
- k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
- k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
- MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \
- g15_m); \
- \
- /* stage 2 */ \
- k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
- k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
- k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
- MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
- h3_m); \
- k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
- k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
- k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
- MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \
- h6_m, h7_m); \
- BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
- BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
- h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
- \
- /* stage 3 */ \
- BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
- k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
- k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
- k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
- MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \
- out7); \
- MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \
- out13, out15); \
- \
- /* stage 4 */ \
- k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
- k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
- k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
- k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
- MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
- MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
- MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
- MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
- }
-
-void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
- int32_t dst_stride);
-void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
-void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
- int32_t dst_stride);
-void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
-#endif // AOM_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
deleted file mode 100644
index c63b1e857..000000000
--- a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
+++ /dev/null
@@ -1,1190 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void idct16_rows_dspr2(const int16_t *input, int16_t *output,
- uint32_t no_rows) {
- int i;
- int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
- int step1_10, step1_11, step1_12, step1_13;
- int step2_0, step2_1, step2_2, step2_3;
- int step2_8, step2_9, step2_10, step2_11;
- int step2_12, step2_13, step2_14, step2_15;
- int load1, load2, load3, load4, load5, load6, load7, load8;
- int result1, result2, result3, result4;
- const int const_2_power_13 = 8192;
-
- for (i = no_rows; i--;) {
- /* prefetch row */
- prefetch_load((const uint8_t *)(input + 16));
-
- __asm__ __volatile__(
- "lh %[load1], 0(%[input]) \n\t"
- "lh %[load2], 16(%[input]) \n\t"
- "lh %[load3], 8(%[input]) \n\t"
- "lh %[load4], 24(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "add %[result1], %[load1], %[load2] \n\t"
- "sub %[result2], %[load1], %[load2] \n\t"
- "madd $ac1, %[result1], %[cospi_16_64] \n\t"
- "madd $ac2, %[result2], %[cospi_16_64] \n\t"
- "extp %[step2_0], $ac1, 31 \n\t"
- "extp %[step2_1], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "madd $ac3, %[load3], %[cospi_24_64] \n\t"
- "msub $ac3, %[load4], %[cospi_8_64] \n\t"
- "extp %[step2_2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "madd $ac1, %[load3], %[cospi_8_64] \n\t"
- "madd $ac1, %[load4], %[cospi_24_64] \n\t"
- "extp %[step2_3], $ac1, 31 \n\t"
-
- "add %[step1_0], %[step2_0], %[step2_3] \n\t"
- "add %[step1_1], %[step2_1], %[step2_2] \n\t"
- "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
- "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
- [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
- [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
- [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
- [step1_3] "=r"(step1_3)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
- [cospi_16_64] "r"(cospi_16_64));
-
- __asm__ __volatile__(
- "lh %[load5], 2(%[input]) \n\t"
- "lh %[load6], 30(%[input]) \n\t"
- "lh %[load7], 18(%[input]) \n\t"
- "lh %[load8], 14(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load5], %[cospi_30_64] \n\t"
- "msub $ac1, %[load6], %[cospi_2_64] \n\t"
- "extp %[result1], $ac1, 31 \n\t"
-
- "madd $ac3, %[load7], %[cospi_14_64] \n\t"
- "msub $ac3, %[load8], %[cospi_18_64] \n\t"
- "extp %[result2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac1, %[load7], %[cospi_18_64] \n\t"
- "madd $ac1, %[load8], %[cospi_14_64] \n\t"
- "extp %[result3], $ac1, 31 \n\t"
-
- "madd $ac2, %[load5], %[cospi_2_64] \n\t"
- "madd $ac2, %[load6], %[cospi_30_64] \n\t"
- "extp %[result4], $ac2, 31 \n\t"
-
- "sub %[load5], %[result1], %[result2] \n\t"
- "sub %[load6], %[result4], %[result3] \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load6], %[cospi_24_64] \n\t"
- "msub $ac1, %[load5], %[cospi_8_64] \n\t"
- "madd $ac3, %[load5], %[cospi_24_64] \n\t"
- "madd $ac3, %[load6], %[cospi_8_64] \n\t"
-
- "extp %[step2_9], $ac1, 31 \n\t"
- "extp %[step2_14], $ac3, 31 \n\t"
- "add %[step2_8], %[result1], %[result2] \n\t"
- "add %[step2_15], %[result4], %[result3] \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
- [load8] "=&r"(load8), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [result3] "=&r"(result3),
- [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
- [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
- [step2_14] "=r"(step2_14)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
- [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
- [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
- __asm__ __volatile__(
- "lh %[load1], 10(%[input]) \n\t"
- "lh %[load2], 22(%[input]) \n\t"
- "lh %[load3], 26(%[input]) \n\t"
- "lh %[load4], 6(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_22_64] \n\t"
- "msub $ac1, %[load2], %[cospi_10_64] \n\t"
- "extp %[result1], $ac1, 31 \n\t"
-
- "madd $ac3, %[load3], %[cospi_6_64] \n\t"
- "msub $ac3, %[load4], %[cospi_26_64] \n\t"
- "extp %[result2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac1, %[load1], %[cospi_10_64] \n\t"
- "madd $ac1, %[load2], %[cospi_22_64] \n\t"
- "extp %[result3], $ac1, 31 \n\t"
-
- "madd $ac2, %[load3], %[cospi_26_64] \n\t"
- "madd $ac2, %[load4], %[cospi_6_64] \n\t"
- "extp %[result4], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[result2], %[result1] \n\t"
- "sub %[load2], %[result4], %[result3] \n\t"
-
- "msub $ac1, %[load1], %[cospi_24_64] \n\t"
- "msub $ac1, %[load2], %[cospi_8_64] \n\t"
- "madd $ac3, %[load2], %[cospi_24_64] \n\t"
- "msub $ac3, %[load1], %[cospi_8_64] \n\t"
-
- "extp %[step2_10], $ac1, 31 \n\t"
- "extp %[step2_13], $ac3, 31 \n\t"
- "add %[step2_11], %[result1], %[result2] \n\t"
- "add %[step2_12], %[result4], %[result3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [result3] "=&r"(result3),
- [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
- [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
- [step2_13] "=r"(step2_13)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
- [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
- [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
- __asm__ __volatile__(
- "lh %[load5], 4(%[input]) \n\t"
- "lh %[load6], 28(%[input]) \n\t"
- "lh %[load7], 20(%[input]) \n\t"
- "lh %[load8], 12(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load5], %[cospi_28_64] \n\t"
- "msub $ac1, %[load6], %[cospi_4_64] \n\t"
- "extp %[result1], $ac1, 31 \n\t"
-
- "madd $ac3, %[load7], %[cospi_12_64] \n\t"
- "msub $ac3, %[load8], %[cospi_20_64] \n\t"
- "extp %[result2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac1, %[load7], %[cospi_20_64] \n\t"
- "madd $ac1, %[load8], %[cospi_12_64] \n\t"
- "extp %[result3], $ac1, 31 \n\t"
-
- "madd $ac2, %[load5], %[cospi_4_64] \n\t"
- "madd $ac2, %[load6], %[cospi_28_64] \n\t"
- "extp %[result4], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load5], %[result4], %[result3] \n\t"
- "sub %[load5], %[load5], %[result1] \n\t"
- "add %[load5], %[load5], %[result2] \n\t"
-
- "sub %[load6], %[result1], %[result2] \n\t"
- "sub %[load6], %[load6], %[result3] \n\t"
- "add %[load6], %[load6], %[result4] \n\t"
-
- "madd $ac1, %[load5], %[cospi_16_64] \n\t"
- "madd $ac3, %[load6], %[cospi_16_64] \n\t"
-
- "extp %[step1_5], $ac1, 31 \n\t"
- "extp %[step1_6], $ac3, 31 \n\t"
- "add %[step1_4], %[result1], %[result2] \n\t"
- "add %[step1_7], %[result4], %[result3] \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
- [load8] "=&r"(load8), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [result3] "=&r"(result3),
- [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
- [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
- [step1_7] "=r"(step1_7)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
- [cospi_16_64] "r"(cospi_16_64));
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
-
- "sub %[load5], %[step2_14], %[step2_13] \n\t"
- "sub %[load5], %[load5], %[step2_9] \n\t"
- "add %[load5], %[load5], %[step2_10] \n\t"
-
- "madd $ac0, %[load5], %[cospi_16_64] \n\t"
-
- "sub %[load6], %[step2_14], %[step2_13] \n\t"
- "sub %[load6], %[load6], %[step2_10] \n\t"
- "add %[load6], %[load6], %[step2_9] \n\t"
-
- "madd $ac1, %[load6], %[cospi_16_64] \n\t"
-
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load5], %[step2_15], %[step2_12] \n\t"
- "sub %[load5], %[load5], %[step2_8] \n\t"
- "add %[load5], %[load5], %[step2_11] \n\t"
-
- "madd $ac2, %[load5], %[cospi_16_64] \n\t"
-
- "sub %[load6], %[step2_15], %[step2_12] \n\t"
- "sub %[load6], %[load6], %[step2_11] \n\t"
- "add %[load6], %[load6], %[step2_8] \n\t"
-
- "madd $ac3, %[load6], %[cospi_16_64] \n\t"
-
- "extp %[step1_10], $ac0, 31 \n\t"
- "extp %[step1_13], $ac1, 31 \n\t"
- "extp %[step1_11], $ac2, 31 \n\t"
- "extp %[step1_12], $ac3, 31 \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
- [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
- [step1_13] "=r"(step1_13)
- : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
- [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
- [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
- [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
- [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
-
- __asm__ __volatile__(
- "add %[load5], %[step1_0], %[step1_7] \n\t"
- "add %[load5], %[load5], %[step2_12] \n\t"
- "add %[load5], %[load5], %[step2_15] \n\t"
- "add %[load6], %[step1_1], %[step1_6] \n\t"
- "add %[load6], %[load6], %[step2_13] \n\t"
- "add %[load6], %[load6], %[step2_14] \n\t"
- "sh %[load5], 0(%[output]) \n\t"
- "sh %[load6], 32(%[output]) \n\t"
- "sub %[load5], %[step1_1], %[step1_6] \n\t"
- "add %[load5], %[load5], %[step2_9] \n\t"
- "add %[load5], %[load5], %[step2_10] \n\t"
- "sub %[load6], %[step1_0], %[step1_7] \n\t"
- "add %[load6], %[load6], %[step2_8] \n\t"
- "add %[load6], %[load6], %[step2_11] \n\t"
- "sh %[load5], 192(%[output]) \n\t"
- "sh %[load6], 224(%[output]) \n\t"
- "sub %[load5], %[step1_0], %[step1_7] \n\t"
- "sub %[load5], %[load5], %[step2_8] \n\t"
- "sub %[load5], %[load5], %[step2_11] \n\t"
- "sub %[load6], %[step1_1], %[step1_6] \n\t"
- "sub %[load6], %[load6], %[step2_9] \n\t"
- "sub %[load6], %[load6], %[step2_10] \n\t"
- "sh %[load5], 256(%[output]) \n\t"
- "sh %[load6], 288(%[output]) \n\t"
- "add %[load5], %[step1_1], %[step1_6] \n\t"
- "sub %[load5], %[load5], %[step2_13] \n\t"
- "sub %[load5], %[load5], %[step2_14] \n\t"
- "add %[load6], %[step1_0], %[step1_7] \n\t"
- "sub %[load6], %[load6], %[step2_12] \n\t"
- "sub %[load6], %[load6], %[step2_15] \n\t"
- "sh %[load5], 448(%[output]) \n\t"
- "sh %[load6], 480(%[output]) \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6)
- : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
- [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
- [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
- [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
- [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
- [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
-
- __asm__ __volatile__(
- "add %[load5], %[step1_2], %[step1_5] \n\t"
- "add %[load5], %[load5], %[step1_13] \n\t"
- "add %[load6], %[step1_3], %[step1_4] \n\t"
- "add %[load6], %[load6], %[step1_12] \n\t"
- "sh %[load5], 64(%[output]) \n\t"
- "sh %[load6], 96(%[output]) \n\t"
- "sub %[load5], %[step1_3], %[step1_4] \n\t"
- "add %[load5], %[load5], %[step1_11] \n\t"
- "sub %[load6], %[step1_2], %[step1_5] \n\t"
- "add %[load6], %[load6], %[step1_10] \n\t"
- "sh %[load5], 128(%[output]) \n\t"
- "sh %[load6], 160(%[output]) \n\t"
- "sub %[load5], %[step1_2], %[step1_5] \n\t"
- "sub %[load5], %[load5], %[step1_10] \n\t"
- "sub %[load6], %[step1_3], %[step1_4] \n\t"
- "sub %[load6], %[load6], %[step1_11] \n\t"
- "sh %[load5], 320(%[output]) \n\t"
- "sh %[load6], 352(%[output]) \n\t"
- "add %[load5], %[step1_3], %[step1_4] \n\t"
- "sub %[load5], %[load5], %[step1_12] \n\t"
- "add %[load6], %[step1_2], %[step1_5] \n\t"
- "sub %[load6], %[load6], %[step1_13] \n\t"
- "sh %[load5], 384(%[output]) \n\t"
- "sh %[load6], 416(%[output]) \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6)
- : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
- [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
- [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
- [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
-
- input += 16;
- output += 1;
- }
-}
-
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
- int i;
- int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
- int step1_8, step1_9, step1_10, step1_11;
- int step1_12, step1_13, step1_14, step1_15;
- int step2_0, step2_1, step2_2, step2_3;
- int step2_8, step2_9, step2_10, step2_11;
- int step2_12, step2_13, step2_14, step2_15;
- int load1, load2, load3, load4, load5, load6, load7, load8;
- int result1, result2, result3, result4;
- const int const_2_power_13 = 8192;
- uint8_t *dest_pix;
- uint8_t *cm = aom_ff_cropTbl;
-
- /* prefetch aom_ff_cropTbl */
- prefetch_load(aom_ff_cropTbl);
- prefetch_load(aom_ff_cropTbl + 32);
- prefetch_load(aom_ff_cropTbl + 64);
- prefetch_load(aom_ff_cropTbl + 96);
- prefetch_load(aom_ff_cropTbl + 128);
- prefetch_load(aom_ff_cropTbl + 160);
- prefetch_load(aom_ff_cropTbl + 192);
- prefetch_load(aom_ff_cropTbl + 224);
-
- for (i = 0; i < 16; ++i) {
- dest_pix = (dest + i);
- __asm__ __volatile__(
- "lh %[load1], 0(%[input]) \n\t"
- "lh %[load2], 16(%[input]) \n\t"
- "lh %[load3], 8(%[input]) \n\t"
- "lh %[load4], 24(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "add %[result1], %[load1], %[load2] \n\t"
- "sub %[result2], %[load1], %[load2] \n\t"
- "madd $ac1, %[result1], %[cospi_16_64] \n\t"
- "madd $ac2, %[result2], %[cospi_16_64] \n\t"
- "extp %[step2_0], $ac1, 31 \n\t"
- "extp %[step2_1], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "madd $ac3, %[load3], %[cospi_24_64] \n\t"
- "msub $ac3, %[load4], %[cospi_8_64] \n\t"
- "extp %[step2_2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "madd $ac1, %[load3], %[cospi_8_64] \n\t"
- "madd $ac1, %[load4], %[cospi_24_64] \n\t"
- "extp %[step2_3], $ac1, 31 \n\t"
-
- "add %[step1_0], %[step2_0], %[step2_3] \n\t"
- "add %[step1_1], %[step2_1], %[step2_2] \n\t"
- "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
- "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
- [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
- [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
- [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
- [step1_3] "=r"(step1_3)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
- [cospi_16_64] "r"(cospi_16_64));
-
- __asm__ __volatile__(
- "lh %[load5], 2(%[input]) \n\t"
- "lh %[load6], 30(%[input]) \n\t"
- "lh %[load7], 18(%[input]) \n\t"
- "lh %[load8], 14(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load5], %[cospi_30_64] \n\t"
- "msub $ac1, %[load6], %[cospi_2_64] \n\t"
- "extp %[result1], $ac1, 31 \n\t"
-
- "madd $ac3, %[load7], %[cospi_14_64] \n\t"
- "msub $ac3, %[load8], %[cospi_18_64] \n\t"
- "extp %[result2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac1, %[load7], %[cospi_18_64] \n\t"
- "madd $ac1, %[load8], %[cospi_14_64] \n\t"
- "extp %[result3], $ac1, 31 \n\t"
-
- "madd $ac2, %[load5], %[cospi_2_64] \n\t"
- "madd $ac2, %[load6], %[cospi_30_64] \n\t"
- "extp %[result4], $ac2, 31 \n\t"
-
- "sub %[load5], %[result1], %[result2] \n\t"
- "sub %[load6], %[result4], %[result3] \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load6], %[cospi_24_64] \n\t"
- "msub $ac1, %[load5], %[cospi_8_64] \n\t"
- "madd $ac3, %[load5], %[cospi_24_64] \n\t"
- "madd $ac3, %[load6], %[cospi_8_64] \n\t"
-
- "extp %[step2_9], $ac1, 31 \n\t"
- "extp %[step2_14], $ac3, 31 \n\t"
- "add %[step2_8], %[result1], %[result2] \n\t"
- "add %[step2_15], %[result4], %[result3] \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
- [load8] "=&r"(load8), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [result3] "=&r"(result3),
- [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
- [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
- [step2_14] "=r"(step2_14)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
- [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
- [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
- __asm__ __volatile__(
- "lh %[load1], 10(%[input]) \n\t"
- "lh %[load2], 22(%[input]) \n\t"
- "lh %[load3], 26(%[input]) \n\t"
- "lh %[load4], 6(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_22_64] \n\t"
- "msub $ac1, %[load2], %[cospi_10_64] \n\t"
- "extp %[result1], $ac1, 31 \n\t"
-
- "madd $ac3, %[load3], %[cospi_6_64] \n\t"
- "msub $ac3, %[load4], %[cospi_26_64] \n\t"
- "extp %[result2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac1, %[load1], %[cospi_10_64] \n\t"
- "madd $ac1, %[load2], %[cospi_22_64] \n\t"
- "extp %[result3], $ac1, 31 \n\t"
-
- "madd $ac2, %[load3], %[cospi_26_64] \n\t"
- "madd $ac2, %[load4], %[cospi_6_64] \n\t"
- "extp %[result4], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[result2], %[result1] \n\t"
- "sub %[load2], %[result4], %[result3] \n\t"
-
- "msub $ac1, %[load1], %[cospi_24_64] \n\t"
- "msub $ac1, %[load2], %[cospi_8_64] \n\t"
- "madd $ac3, %[load2], %[cospi_24_64] \n\t"
- "msub $ac3, %[load1], %[cospi_8_64] \n\t"
-
- "extp %[step2_10], $ac1, 31 \n\t"
- "extp %[step2_13], $ac3, 31 \n\t"
- "add %[step2_11], %[result1], %[result2] \n\t"
- "add %[step2_12], %[result4], %[result3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [result3] "=&r"(result3),
- [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
- [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
- [step2_13] "=r"(step2_13)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
- [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
- [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
- __asm__ __volatile__(
- "lh %[load5], 4(%[input]) \n\t"
- "lh %[load6], 28(%[input]) \n\t"
- "lh %[load7], 20(%[input]) \n\t"
- "lh %[load8], 12(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load5], %[cospi_28_64] \n\t"
- "msub $ac1, %[load6], %[cospi_4_64] \n\t"
- "extp %[result1], $ac1, 31 \n\t"
-
- "madd $ac3, %[load7], %[cospi_12_64] \n\t"
- "msub $ac3, %[load8], %[cospi_20_64] \n\t"
- "extp %[result2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac1, %[load7], %[cospi_20_64] \n\t"
- "madd $ac1, %[load8], %[cospi_12_64] \n\t"
- "extp %[result3], $ac1, 31 \n\t"
-
- "madd $ac2, %[load5], %[cospi_4_64] \n\t"
- "madd $ac2, %[load6], %[cospi_28_64] \n\t"
- "extp %[result4], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load5], %[result4], %[result3] \n\t"
- "sub %[load5], %[load5], %[result1] \n\t"
- "add %[load5], %[load5], %[result2] \n\t"
-
- "sub %[load6], %[result1], %[result2] \n\t"
- "sub %[load6], %[load6], %[result3] \n\t"
- "add %[load6], %[load6], %[result4] \n\t"
-
- "madd $ac1, %[load5], %[cospi_16_64] \n\t"
- "madd $ac3, %[load6], %[cospi_16_64] \n\t"
-
- "extp %[step1_5], $ac1, 31 \n\t"
- "extp %[step1_6], $ac3, 31 \n\t"
-
- "add %[step1_4], %[result1], %[result2] \n\t"
- "add %[step1_7], %[result4], %[result3] \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
- [load8] "=&r"(load8), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [result3] "=&r"(result3),
- [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
- [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
- [step1_7] "=r"(step1_7)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
- [cospi_16_64] "r"(cospi_16_64));
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
-
- "sub %[load5], %[step2_14], %[step2_13] \n\t"
- "sub %[load5], %[load5], %[step2_9] \n\t"
- "add %[load5], %[load5], %[step2_10] \n\t"
-
- "madd $ac0, %[load5], %[cospi_16_64] \n\t"
-
- "sub %[load6], %[step2_14], %[step2_13] \n\t"
- "sub %[load6], %[load6], %[step2_10] \n\t"
- "add %[load6], %[load6], %[step2_9] \n\t"
-
- "madd $ac1, %[load6], %[cospi_16_64] \n\t"
-
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load5], %[step2_15], %[step2_12] \n\t"
- "sub %[load5], %[load5], %[step2_8] \n\t"
- "add %[load5], %[load5], %[step2_11] \n\t"
-
- "madd $ac2, %[load5], %[cospi_16_64] \n\t"
-
- "sub %[load6], %[step2_15], %[step2_12] \n\t"
- "sub %[load6], %[load6], %[step2_11] \n\t"
- "add %[load6], %[load6], %[step2_8] \n\t"
-
- "madd $ac3, %[load6], %[cospi_16_64] \n\t"
-
- "extp %[step1_10], $ac0, 31 \n\t"
- "extp %[step1_13], $ac1, 31 \n\t"
- "extp %[step1_11], $ac2, 31 \n\t"
- "extp %[step1_12], $ac3, 31 \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
- [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
- [step1_13] "=r"(step1_13)
- : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
- [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
- [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
- [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
- [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
-
- step1_8 = step2_8 + step2_11;
- step1_9 = step2_9 + step2_10;
- step1_14 = step2_13 + step2_14;
- step1_15 = step2_12 + step2_15;
-
- __asm__ __volatile__(
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "add %[load5], %[step1_0], %[step1_7] \n\t"
- "add %[load5], %[load5], %[step1_15] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "add %[load6], %[step1_1], %[step1_6] \n\t"
- "add %[load6], %[load6], %[step1_14] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "add %[load5], %[step1_2], %[step1_5] \n\t"
- "add %[load5], %[load5], %[step1_13] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "add %[load6], %[step1_3], %[step1_4] \n\t"
- "add %[load6], %[load6], %[step1_12] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "sub %[load5], %[step1_3], %[step1_4] \n\t"
- "add %[load5], %[load5], %[step1_11] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "sub %[load6], %[step1_2], %[step1_5] \n\t"
- "add %[load6], %[load6], %[step1_10] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "sub %[load5], %[step1_1], %[step1_6] \n\t"
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "add %[load5], %[load5], %[step1_9] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "sub %[load6], %[step1_0], %[step1_7] \n\t"
- "add %[load6], %[load6], %[step1_8] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "sub %[load5], %[step1_0], %[step1_7] \n\t"
- "sub %[load5], %[load5], %[step1_8] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "sub %[load6], %[step1_1], %[step1_6] \n\t"
- "sub %[load6], %[load6], %[step1_9] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "sub %[load5], %[step1_2], %[step1_5] \n\t"
- "sub %[load5], %[load5], %[step1_10] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "sub %[load6], %[step1_3], %[step1_4] \n\t"
- "sub %[load6], %[load6], %[step1_11] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "add %[load5], %[step1_3], %[step1_4] \n\t"
- "sub %[load5], %[load5], %[step1_12] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "add %[load6], %[step1_2], %[step1_5] \n\t"
- "sub %[load6], %[load6], %[step1_13] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[load7], 0(%[dest_pix]) \n\t"
- "add %[load5], %[step1_1], %[step1_6] \n\t"
- "sub %[load5], %[load5], %[step1_14] \n\t"
- "addi %[load5], %[load5], 32 \n\t"
- "sra %[load5], %[load5], 6 \n\t"
- "add %[load7], %[load7], %[load5] \n\t"
- "lbux %[load5], %[load7](%[cm]) \n\t"
- "add %[load6], %[step1_0], %[step1_7] \n\t"
- "sub %[load6], %[load6], %[step1_15] \n\t"
- "sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[load8], 0(%[dest_pix]) \n\t"
- "addi %[load6], %[load6], 32 \n\t"
- "sra %[load6], %[load6], 6 \n\t"
- "add %[load8], %[load8], %[load6] \n\t"
- "lbux %[load6], %[load8](%[cm]) \n\t"
- "sb %[load6], 0(%[dest_pix]) \n\t"
-
- : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
- [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
- :
- [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
- [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
- [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
- [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
- [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
- [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
- [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
-
- input += 16;
- }
-}
-
-void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
- uint32_t pos = 45;
-
- /* bit positon for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
-
- // First transform rows
- idct16_rows_dspr2(input, out, 16);
-
- // Then transform columns and add to dest
- idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-// 16x16 inverse DCT for blocks whose non-zero coefficients sit in the upper-left 4x4: 4-row row pass, zero-fill of the rest of the scratch block, then the column pass.
-void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
- DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
- int16_t *outptr = out;
- uint32_t i;
- uint32_t pos = 45;
- // outptr walks the scratch block out; i counts the zero-fill iterations below.
- /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- idct16_rows_dspr2(input, outptr, 4);
- // Step past the first 4 int16_t of each 16-entry line; the loop below clears the remaining 12.
- outptr += 4;
- for (i = 0; i < 6; ++i) {
- __asm__ __volatile__(
- "sw $zero, 0(%[outptr]) \n\t"
- "sw $zero, 32(%[outptr]) \n\t"
- "sw $zero, 64(%[outptr]) \n\t"
- "sw $zero, 96(%[outptr]) \n\t"
- "sw $zero, 128(%[outptr]) \n\t"
- "sw $zero, 160(%[outptr]) \n\t"
- "sw $zero, 192(%[outptr]) \n\t"
- "sw $zero, 224(%[outptr]) \n\t"
- "sw $zero, 256(%[outptr]) \n\t"
- "sw $zero, 288(%[outptr]) \n\t"
- "sw $zero, 320(%[outptr]) \n\t"
- "sw $zero, 352(%[outptr]) \n\t"
- "sw $zero, 384(%[outptr]) \n\t"
- "sw $zero, 416(%[outptr]) \n\t"
- "sw $zero, 448(%[outptr]) \n\t"
- "sw $zero, 480(%[outptr]) \n\t"
- // Each sw clears one 32-bit word (two int16_t); offsets step by 32 bytes, i.e. one 16-entry line of out.
- :
- : [outptr] "r"(outptr));
- // NOTE(review): the asm stores through outptr but lists no output operand or "memory" clobber -- confirm this is safe.
- outptr += 2;
- }
- // After 6 iterations of 2 int16_t each, entries 4..15 of all 16 lines are zero.
- // Then transform columns
- idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-// 16x16 inverse DCT for a DC-only block: derives a single value a1 from input[0] and applies it to 16 bytes on each of 16 lines of dest.
-void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
- uint32_t pos = 45;
- int32_t out;
- int32_t r;
- int32_t a1, absa1;
- int32_t vector_a1;
- int32_t t1, t2, t3, t4;
- int32_t vector_1, vector_2, vector_3, vector_4;
- // t1..t4 receive four words loaded from dest; vector_1..vector_4 receive the adjusted words stored back.
- /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
-
- :
- : [pos] "r"(pos));
- // The macro below is defined outside this block; by its name it applies the cospi_16_64 rounding step twice -- confirm.
- out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
- __asm__ __volatile__(
- "addi %[out], %[out], 32 \n\t"
- "sra %[a1], %[out], 6 \n\t"
- // i.e. a1 = (out + 32) >> 6; out itself is also updated by the addi.
- : [out] "+r"(out), [a1] "=r"(a1)
- :);
- // Negative a1: subtract |a1| from each byte (subu_s.qb); otherwise add a1 (addu_s.qb).
- if (a1 < 0) {
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__(
- "abs %[absa1], %[a1] \n\t"
- "replv.qb %[vector_a1], %[absa1] \n\t"
- // NOTE(review): replv.qb presumably replicates only the low byte of absa1 -- confirm |a1| <= 255 for all inputs.
- : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
- // 16 iterations; each handles 16 bytes at dest, then advances dest by dest_stride.
- for (r = 16; r--;) {
- __asm__ __volatile__(
- "lw %[t1], 0(%[dest]) \n\t"
- "lw %[t2], 4(%[dest]) \n\t"
- "lw %[t3], 8(%[dest]) \n\t"
- "lw %[t4], 12(%[dest]) \n\t"
- "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
- "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
- "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
- "sw %[vector_1], 0(%[dest]) \n\t"
- "sw %[vector_2], 4(%[dest]) \n\t"
- "sw %[vector_3], 8(%[dest]) \n\t"
- "sw %[vector_4], 12(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
-
- : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
- [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
- [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
- [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
- }
- } else {
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
-
- : [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
- // Same 16-line loop as the negative branch, with addu_s.qb in place of subu_s.qb.
- for (r = 16; r--;) {
- __asm__ __volatile__(
- "lw %[t1], 0(%[dest]) \n\t"
- "lw %[t2], 4(%[dest]) \n\t"
- "lw %[t3], 8(%[dest]) \n\t"
- "lw %[t4], 12(%[dest]) \n\t"
- "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
- "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
- "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
- "sw %[vector_1], 0(%[dest]) \n\t"
- "sw %[vector_2], 4(%[dest]) \n\t"
- "sw %[vector_3], 8(%[dest]) \n\t"
- "sw %[vector_4], 12(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
-
- : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
- [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
- [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
- [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
- }
- }
-}
-// 16-point transform in plain C (inverse ADST, going by the name): reads input[0..15] and writes output[0..15].
-void iadst16_dspr2(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
- // Inputs are loaded in a permuted order (x0 = input[15], x1 = input[0], x2 = input[13], ...).
- int x0 = input[15];
- int x1 = input[0];
- int x2 = input[13];
- int x3 = input[2];
- int x4 = input[11];
- int x5 = input[4];
- int x6 = input[9];
- int x7 = input[6];
- int x8 = input[7];
- int x9 = input[8];
- int x10 = input[5];
- int x11 = input[10];
- int x12 = input[3];
- int x13 = input[12];
- int x14 = input[1];
- int x15 = input[14];
- // All-zero input: write sixteen zeros and return without running the stages.
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = output[8] = output[9] = output[10] =
- output[11] = output[12] = output[13] = output[14] = output[15] = 0;
- return;
- }
-
- // stage 1: each input pair is combined with a pair of odd-indexed cospi constants
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- // Combine s[k] with s[k + 8]: sums into x0..x7, differences into x8..x15, each passed through dct_const_round_shift.
- x0 = dct_const_round_shift(s0 + s8);
- x1 = dct_const_round_shift(s1 + s9);
- x2 = dct_const_round_shift(s2 + s10);
- x3 = dct_const_round_shift(s3 + s11);
- x4 = dct_const_round_shift(s4 + s12);
- x5 = dct_const_round_shift(s5 + s13);
- x6 = dct_const_round_shift(s6 + s14);
- x7 = dct_const_round_shift(s7 + s15);
- x8 = dct_const_round_shift(s0 - s8);
- x9 = dct_const_round_shift(s1 - s9);
- x10 = dct_const_round_shift(s2 - s10);
- x11 = dct_const_round_shift(s3 - s11);
- x12 = dct_const_round_shift(s4 - s12);
- x13 = dct_const_round_shift(s5 - s13);
- x14 = dct_const_round_shift(s6 - s14);
- x15 = dct_const_round_shift(s7 - s15);
-
- // stage 2: s0..s7 pass through; x8..x15 are multiplied by the cospi 4/28 and 12/20 pairs
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
- // x0..x7 are plain sums/differences of the passed-through values; only x8..x15 go through dct_const_round_shift.
- x0 = s0 + s4;
- x1 = s1 + s5;
- x2 = s2 + s6;
- x3 = s3 + s7;
- x4 = s0 - s4;
- x5 = s1 - s5;
- x6 = s2 - s6;
- x7 = s3 - s7;
- x8 = dct_const_round_shift(s8 + s12);
- x9 = dct_const_round_shift(s9 + s13);
- x10 = dct_const_round_shift(s10 + s14);
- x11 = dct_const_round_shift(s11 + s15);
- x12 = dct_const_round_shift(s8 - s12);
- x13 = dct_const_round_shift(s9 - s13);
- x14 = dct_const_round_shift(s10 - s14);
- x15 = dct_const_round_shift(s11 - s15);
-
- // stage 3: s4..s7 and s12..s15 use the cospi 8/24 pair; s0..s3 and s8..s11 pass through
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
- // Combine s[k] with s[k + 2] inside each group of four; only the multiplied groups are rounded.
- x0 = s0 + s2;
- x1 = s1 + s3;
- x2 = s0 - s2;
- x3 = s1 - s3;
- x4 = dct_const_round_shift(s4 + s6);
- x5 = dct_const_round_shift(s5 + s7);
- x6 = dct_const_round_shift(s4 - s6);
- x7 = dct_const_round_shift(s5 - s7);
- x8 = s8 + s10;
- x9 = s9 + s11;
- x10 = s8 - s10;
- x11 = s9 - s11;
- x12 = dct_const_round_shift(s12 + s14);
- x13 = dct_const_round_shift(s13 + s15);
- x14 = dct_const_round_shift(s12 - s14);
- x15 = dct_const_round_shift(s13 - s15);
-
- // stage 4: pairs (2,3), (6,7), (10,11), (14,15) are scaled by +/-cospi_16_64 and rounded; the other x values are left as they are
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = dct_const_round_shift(s2);
- x3 = dct_const_round_shift(s3);
- x6 = dct_const_round_shift(s6);
- x7 = dct_const_round_shift(s7);
- x10 = dct_const_round_shift(s10);
- x11 = dct_const_round_shift(s11);
- x14 = dct_const_round_shift(s14);
- x15 = dct_const_round_shift(s15);
- // Results are written in a permuted order; output[1], output[3], output[13] and output[15] are negated.
- output[0] = x0;
- output[1] = -x8;
- output[2] = x12;
- output[3] = -x4;
- output[4] = x6;
- output[5] = x14;
- output[6] = x10;
- output[7] = x2;
- output[8] = x3;
- output[9] = x11;
- output[10] = x15;
- output[11] = x7;
- output[12] = x5;
- output[13] = -x13;
- output[14] = x9;
- output[15] = -x1;
-}
-
-#endif // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
deleted file mode 100644
index d469d1ad0..000000000
--- a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
+++ /dev/null
@@ -1,1042 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
- int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
- int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
- int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
- int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
- int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
- int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
- int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
- int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
- int16_t step2_28, step2_29, step2_30, step2_31;
- int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
- int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
- int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
- int16_t step3_28, step3_29, step3_30, step3_31;
- int temp0, temp1, temp2, temp3;
- int load1, load2, load3, load4;
- int result1, result2;
- int i, temp21;
- uint8_t *dest_pix, *dest_pix1;
- const int const_2_power_13 = 8192;
- uint8_t *cm = aom_ff_cropTbl;
-
- /* prefetch aom_ff_cropTbl */
- prefetch_load(aom_ff_cropTbl);
- prefetch_load(aom_ff_cropTbl + 32);
- prefetch_load(aom_ff_cropTbl + 64);
- prefetch_load(aom_ff_cropTbl + 96);
- prefetch_load(aom_ff_cropTbl + 128);
- prefetch_load(aom_ff_cropTbl + 160);
- prefetch_load(aom_ff_cropTbl + 192);
- prefetch_load(aom_ff_cropTbl + 224);
-
- for (i = 0; i < 32; ++i) {
- dest_pix = dest + i;
- dest_pix1 = dest + i + 31 * dest_stride;
-
- __asm__ __volatile__(
- "lh %[load1], 2(%[input]) \n\t"
- "lh %[load2], 62(%[input]) \n\t"
- "lh %[load3], 34(%[input]) \n\t"
- "lh %[load4], 30(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_31_64] \n\t"
- "msub $ac1, %[load2], %[cospi_1_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_1_64] \n\t"
- "madd $ac3, %[load2], %[cospi_31_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_15_64] \n\t"
- "msub $ac2, %[load4], %[cospi_17_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_17_64] \n\t"
- "madd $ac1, %[load4], %[cospi_15_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp3], %[temp2] \n\t"
- "sub %[load2], %[temp0], %[temp1] \n\t"
-
- "madd $ac1, %[load1], %[cospi_28_64] \n\t"
- "msub $ac1, %[load2], %[cospi_4_64] \n\t"
- "madd $ac3, %[load1], %[cospi_4_64] \n\t"
- "madd $ac3, %[load2], %[cospi_28_64] \n\t"
-
- "extp %[step1_17], $ac1, 31 \n\t"
- "extp %[step1_30], $ac3, 31 \n\t"
- "add %[step1_16], %[temp0], %[temp1] \n\t"
- "add %[step1_31], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
- [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
- [step1_31] "=r"(step1_31)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
- [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
-
- __asm__ __volatile__(
- "lh %[load1], 18(%[input]) \n\t"
- "lh %[load2], 46(%[input]) \n\t"
- "lh %[load3], 50(%[input]) \n\t"
- "lh %[load4], 14(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_23_64] \n\t"
- "msub $ac1, %[load2], %[cospi_9_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_9_64] \n\t"
- "madd $ac3, %[load2], %[cospi_23_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_7_64] \n\t"
- "msub $ac2, %[load4], %[cospi_25_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_25_64] \n\t"
- "madd $ac1, %[load4], %[cospi_7_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp1], %[temp0] \n\t"
- "sub %[load2], %[temp2], %[temp3] \n\t"
-
- "msub $ac1, %[load1], %[cospi_28_64] \n\t"
- "msub $ac1, %[load2], %[cospi_4_64] \n\t"
- "msub $ac3, %[load1], %[cospi_4_64] \n\t"
- "madd $ac3, %[load2], %[cospi_28_64] \n\t"
-
- "extp %[step1_18], $ac1, 31 \n\t"
- "extp %[step1_29], $ac3, 31 \n\t"
- "add %[step1_19], %[temp0], %[temp1] \n\t"
- "add %[step1_28], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
- [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
- [step1_29] "=r"(step1_29)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
- [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
-
- __asm__ __volatile__(
- "lh %[load1], 10(%[input]) \n\t"
- "lh %[load2], 54(%[input]) \n\t"
- "lh %[load3], 42(%[input]) \n\t"
- "lh %[load4], 22(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_27_64] \n\t"
- "msub $ac1, %[load2], %[cospi_5_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_5_64] \n\t"
- "madd $ac3, %[load2], %[cospi_27_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_11_64] \n\t"
- "msub $ac2, %[load4], %[cospi_21_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_21_64] \n\t"
- "madd $ac1, %[load4], %[cospi_11_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp0], %[temp1] \n\t"
- "sub %[load2], %[temp3], %[temp2] \n\t"
-
- "madd $ac1, %[load2], %[cospi_12_64] \n\t"
- "msub $ac1, %[load1], %[cospi_20_64] \n\t"
- "madd $ac3, %[load1], %[cospi_12_64] \n\t"
- "madd $ac3, %[load2], %[cospi_20_64] \n\t"
-
- "extp %[step1_21], $ac1, 31 \n\t"
- "extp %[step1_26], $ac3, 31 \n\t"
- "add %[step1_20], %[temp0], %[temp1] \n\t"
- "add %[step1_27], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
- [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
- [step1_27] "=r"(step1_27)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
- [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
- [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
- __asm__ __volatile__(
- "lh %[load1], 26(%[input]) \n\t"
- "lh %[load2], 38(%[input]) \n\t"
- "lh %[load3], 58(%[input]) \n\t"
- "lh %[load4], 6(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_19_64] \n\t"
- "msub $ac1, %[load2], %[cospi_13_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
- "madd $ac3, %[load1], %[cospi_13_64] \n\t"
- "madd $ac3, %[load2], %[cospi_19_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_3_64] \n\t"
- "msub $ac2, %[load4], %[cospi_29_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
- "madd $ac1, %[load3], %[cospi_29_64] \n\t"
- "madd $ac1, %[load4], %[cospi_3_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp1], %[temp0] \n\t"
- "sub %[load2], %[temp2], %[temp3] \n\t"
- "msub $ac1, %[load1], %[cospi_12_64] \n\t"
- "msub $ac1, %[load2], %[cospi_20_64] \n\t"
- "msub $ac3, %[load1], %[cospi_20_64] \n\t"
- "madd $ac3, %[load2], %[cospi_12_64] \n\t"
- "extp %[step1_22], $ac1, 31 \n\t"
- "extp %[step1_25], $ac3, 31 \n\t"
- "add %[step1_23], %[temp0], %[temp1] \n\t"
- "add %[step1_24], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
- [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
- [step1_25] "=r"(step1_25)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
- [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
- [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
- __asm__ __volatile__(
- "lh %[load1], 4(%[input]) \n\t"
- "lh %[load2], 60(%[input]) \n\t"
- "lh %[load3], 36(%[input]) \n\t"
- "lh %[load4], 28(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_30_64] \n\t"
- "msub $ac1, %[load2], %[cospi_2_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
- "madd $ac3, %[load1], %[cospi_2_64] \n\t"
- "madd $ac3, %[load2], %[cospi_30_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_14_64] \n\t"
- "msub $ac2, %[load4], %[cospi_18_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
- "madd $ac1, %[load3], %[cospi_18_64] \n\t"
- "madd $ac1, %[load4], %[cospi_14_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp0], %[temp1] \n\t"
- "sub %[load2], %[temp3], %[temp2] \n\t"
- "msub $ac1, %[load1], %[cospi_8_64] \n\t"
- "madd $ac1, %[load2], %[cospi_24_64] \n\t"
- "madd $ac3, %[load1], %[cospi_24_64] \n\t"
- "madd $ac3, %[load2], %[cospi_8_64] \n\t"
- "extp %[step2_9], $ac1, 31 \n\t"
- "extp %[step2_14], $ac3, 31 \n\t"
- "add %[step2_8], %[temp0], %[temp1] \n\t"
- "add %[step2_15], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
- [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
- [step2_15] "=r"(step2_15)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
- [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
- [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
- __asm__ __volatile__(
- "lh %[load1], 20(%[input]) \n\t"
- "lh %[load2], 44(%[input]) \n\t"
- "lh %[load3], 52(%[input]) \n\t"
- "lh %[load4], 12(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_22_64] \n\t"
- "msub $ac1, %[load2], %[cospi_10_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
- "madd $ac3, %[load1], %[cospi_10_64] \n\t"
- "madd $ac3, %[load2], %[cospi_22_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_6_64] \n\t"
- "msub $ac2, %[load4], %[cospi_26_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
- "madd $ac1, %[load3], %[cospi_26_64] \n\t"
- "madd $ac1, %[load4], %[cospi_6_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp1], %[temp0] \n\t"
- "sub %[load2], %[temp2], %[temp3] \n\t"
- "msub $ac1, %[load1], %[cospi_24_64] \n\t"
- "msub $ac1, %[load2], %[cospi_8_64] \n\t"
- "madd $ac3, %[load2], %[cospi_24_64] \n\t"
- "msub $ac3, %[load1], %[cospi_8_64] \n\t"
- "extp %[step2_10], $ac1, 31 \n\t"
- "extp %[step2_13], $ac3, 31 \n\t"
- "add %[step2_11], %[temp0], %[temp1] \n\t"
- "add %[step2_12], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
- [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
- [step2_13] "=r"(step2_13)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
- [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
- [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "sub %[temp0], %[step2_14], %[step2_13] \n\t"
- "sub %[temp0], %[temp0], %[step2_9] \n\t"
- "add %[temp0], %[temp0], %[step2_10] \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sub %[temp1], %[step2_14], %[step2_13] \n\t"
- "add %[temp1], %[temp1], %[step2_9] \n\t"
- "sub %[temp1], %[temp1], %[step2_10] \n\t"
- "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "sub %[temp0], %[step2_15], %[step2_12] \n\t"
- "sub %[temp0], %[temp0], %[step2_8] \n\t"
- "add %[temp0], %[temp0], %[step2_11] \n\t"
- "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sub %[temp1], %[step2_15], %[step2_12] \n\t"
- "add %[temp1], %[temp1], %[step2_8] \n\t"
- "sub %[temp1], %[temp1], %[step2_11] \n\t"
- "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
-
- "add %[step3_8], %[step2_8], %[step2_11] \n\t"
- "add %[step3_9], %[step2_9], %[step2_10] \n\t"
- "add %[step3_14], %[step2_13], %[step2_14] \n\t"
- "add %[step3_15], %[step2_12], %[step2_15] \n\t"
- "extp %[step3_10], $ac0, 31 \n\t"
- "extp %[step3_13], $ac1, 31 \n\t"
- "extp %[step3_11], $ac2, 31 \n\t"
- "extp %[step3_12], $ac3, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
- [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
- [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
- [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
- [step3_15] "=r"(step3_15)
- : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
- [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
- [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
- [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
- [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
-
- step2_18 = step1_17 - step1_18;
- step2_29 = step1_30 - step1_29;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
- "extp %[step3_18], $ac0, 31 \n\t"
-
- : [step3_18] "=r"(step3_18)
- : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
- [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
- step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_19 = step1_16 - step1_19;
- step2_28 = step1_31 - step1_28;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
- "extp %[step3_19], $ac0, 31 \n\t"
-
- : [step3_19] "=r"(step3_19)
- : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
- [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
- step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_16 = step1_16 + step1_19;
- step3_17 = step1_17 + step1_18;
- step3_30 = step1_29 + step1_30;
- step3_31 = step1_28 + step1_31;
-
- step2_20 = step1_23 - step1_20;
- step2_27 = step1_24 - step1_27;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
- "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
- "extp %[step3_20], $ac0, 31 \n\t"
-
- : [step3_20] "=r"(step3_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
- step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_21 = step1_22 - step1_21;
- step2_26 = step1_25 - step1_26;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
- "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
- "extp %[step3_21], $ac1, 31 \n\t"
-
- : [step3_21] "=r"(step3_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
- [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
- step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_22 = step1_21 + step1_22;
- step3_23 = step1_20 + step1_23;
- step3_24 = step1_24 + step1_27;
- step3_25 = step1_25 + step1_26;
-
- step2_16 = step3_16 + step3_23;
- step2_17 = step3_17 + step3_22;
- step2_18 = step3_18 + step3_21;
- step2_19 = step3_19 + step3_20;
- step2_20 = step3_19 - step3_20;
- step2_21 = step3_18 - step3_21;
- step2_22 = step3_17 - step3_22;
- step2_23 = step3_16 - step3_23;
-
- step2_24 = step3_31 - step3_24;
- step2_25 = step3_30 - step3_25;
- step2_26 = step3_29 - step3_26;
- step2_27 = step3_28 - step3_27;
- step2_28 = step3_28 + step3_27;
- step2_29 = step3_29 + step3_26;
- step2_30 = step3_30 + step3_25;
- step2_31 = step3_31 + step3_24;
-
- __asm__ __volatile__(
- "lh %[load1], 0(%[input]) \n\t"
- "lh %[load2], 32(%[input]) \n\t"
- "lh %[load3], 16(%[input]) \n\t"
- "lh %[load4], 48(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "add %[result1], %[load1], %[load2] \n\t"
- "sub %[result2], %[load1], %[load2] \n\t"
- "madd $ac1, %[result1], %[cospi_16_64] \n\t"
- "madd $ac2, %[result2], %[cospi_16_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "madd $ac3, %[load3], %[cospi_24_64] \n\t"
- "msub $ac3, %[load4], %[cospi_8_64] \n\t"
- "extp %[temp2], $ac3, 31 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "madd $ac1, %[load3], %[cospi_8_64] \n\t"
- "madd $ac1, %[load4], %[cospi_24_64] \n\t"
- "extp %[temp3], $ac1, 31 \n\t"
- "add %[step1_0], %[temp0], %[temp3] \n\t"
- "add %[step1_1], %[temp1], %[temp2] \n\t"
- "sub %[step1_2], %[temp1], %[temp2] \n\t"
- "sub %[step1_3], %[temp0], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
- [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
- [step1_3] "=r"(step1_3)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
- [cospi_16_64] "r"(cospi_16_64));
-
- __asm__ __volatile__(
- "lh %[load1], 8(%[input]) \n\t"
- "lh %[load2], 56(%[input]) \n\t"
- "lh %[load3], 40(%[input]) \n\t"
- "lh %[load4], 24(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_28_64] \n\t"
- "msub $ac1, %[load2], %[cospi_4_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
- "madd $ac3, %[load1], %[cospi_4_64] \n\t"
- "madd $ac3, %[load2], %[cospi_28_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_12_64] \n\t"
- "msub $ac2, %[load4], %[cospi_20_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
- "madd $ac1, %[load3], %[cospi_20_64] \n\t"
- "madd $ac1, %[load4], %[cospi_12_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp3], %[temp2] \n\t"
- "sub %[load1], %[load1], %[temp0] \n\t"
- "add %[load1], %[load1], %[temp1] \n\t"
- "sub %[load2], %[temp0], %[temp1] \n\t"
- "sub %[load2], %[load2], %[temp2] \n\t"
- "add %[load2], %[load2], %[temp3] \n\t"
- "madd $ac1, %[load1], %[cospi_16_64] \n\t"
- "madd $ac3, %[load2], %[cospi_16_64] \n\t"
-
- "extp %[step1_5], $ac1, 31 \n\t"
- "extp %[step1_6], $ac3, 31 \n\t"
- "add %[step1_4], %[temp0], %[temp1] \n\t"
- "add %[step1_7], %[temp3], %[temp2] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
- [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
- [step1_7] "=r"(step1_7)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
- [cospi_16_64] "r"(cospi_16_64));
-
- step2_0 = step1_0 + step1_7;
- step2_1 = step1_1 + step1_6;
- step2_2 = step1_2 + step1_5;
- step2_3 = step1_3 + step1_4;
- step2_4 = step1_3 - step1_4;
- step2_5 = step1_2 - step1_5;
- step2_6 = step1_1 - step1_6;
- step2_7 = step1_0 - step1_7;
-
- // stage 7
- step1_0 = step2_0 + step3_15;
- step1_1 = step2_1 + step3_14;
- step1_2 = step2_2 + step3_13;
- step1_3 = step2_3 + step3_12;
- step1_4 = step2_4 + step3_11;
- step1_5 = step2_5 + step3_10;
- step1_6 = step2_6 + step3_9;
- step1_7 = step2_7 + step3_8;
- step1_8 = step2_7 - step3_8;
- step1_9 = step2_6 - step3_9;
- step1_10 = step2_5 - step3_10;
- step1_11 = step2_4 - step3_11;
- step1_12 = step2_3 - step3_12;
- step1_13 = step2_2 - step3_13;
- step1_14 = step2_1 - step3_14;
- step1_15 = step2_0 - step3_15;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_27], %[step2_20] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_20], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_20 + step2_27) * cospi_16_64;
- step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_26], %[step2_21] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_21], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
- [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_21 + step2_26) * cospi_16_64;
- step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_25], %[step2_22] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_22], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
- : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
- [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_22 + step2_25) * cospi_16_64;
- step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_24], %[step2_23] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_23], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
- : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
- [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_23 + step2_24) * cospi_16_64;
- step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_0], %[step2_31] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_1], %[step2_30] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_2], %[step2_29] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_3], %[step2_28] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
- [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
- [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
- [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
- [step2_31] "r"(step2_31));
-
- step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
- step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
- step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
- step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_15] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_14] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_13] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_12] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_4], %[step1_27] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_5], %[step1_26] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_6], %[step1_25] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_7], %[step1_24] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
- [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
- [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
- [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
- [step1_27] "r"(step1_27));
-
- step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
- step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
- step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
- step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_15] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_14] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_13] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_12] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_8], %[step1_23] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_9], %[step1_22] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_10], %[step1_21] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_11], %[step1_20] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
- [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
- [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
- [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
- [step1_23] "r"(step1_23));
-
- step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
- step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
- step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
- step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_15] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_14] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_13] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_12] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_12], %[step2_19] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_13], %[step2_18] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix]) \n\t"
- "add %[temp0], %[step1_14], %[step2_17] \n\t"
- "addi %[temp0], %[temp0], 32 \n\t"
- "sra %[temp0], %[temp0], 6 \n\t"
- "add %[temp2], %[temp2], %[temp0] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "add %[temp1], %[step1_15], %[step2_16] \n\t"
- "sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix]) \n\t"
- "addi %[temp1], %[temp1], 32 \n\t"
- "sra %[temp1], %[temp1], 6 \n\t"
- "add %[temp3], %[temp3], %[temp1] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix]) \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
- [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
- [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
- [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
-
- step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
- step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
- step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
- step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
-
- __asm__ __volatile__(
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_15] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_14] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
-
- "lbu %[temp2], 0(%[dest_pix1]) \n\t"
- "add %[temp2], %[temp2], %[step3_13] \n\t"
- "lbux %[temp0], %[temp2](%[cm]) \n\t"
- "sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
- "lbu %[temp3], 0(%[dest_pix1]) \n\t"
- "add %[temp3], %[temp3], %[step3_12] \n\t"
- "lbux %[temp1], %[temp3](%[cm]) \n\t"
- "sb %[temp1], 0(%[dest_pix1]) \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
- [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
- input += 32;
- }
-}
-#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
deleted file mode 100644
index fa7703217..000000000
--- a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
+++ /dev/null
@@ -1,1030 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
- uint32_t no_rows) {
- int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
- int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
- int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
- int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
- int16_t step1_28, step1_29, step1_30, step1_31;
- int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
- int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
- int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
- int16_t step2_28, step2_29, step2_30, step2_31;
- int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
- int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
- int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
- int16_t step3_29, step3_30, step3_31;
- int temp0, temp1, temp2, temp3;
- int load1, load2, load3, load4;
- int result1, result2;
- int temp21;
- int i;
- const int const_2_power_13 = 8192;
- const int32_t *input_int;
-
- for (i = no_rows; i--;) {
- input_int = (const int32_t *)input;
-
- if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
- input_int[4] | input_int[5] | input_int[6] | input_int[7] |
- input_int[8] | input_int[9] | input_int[10] | input_int[11] |
- input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
- input += 32;
-
- __asm__ __volatile__(
- "sh $zero, 0(%[output]) \n\t"
- "sh $zero, 64(%[output]) \n\t"
- "sh $zero, 128(%[output]) \n\t"
- "sh $zero, 192(%[output]) \n\t"
- "sh $zero, 256(%[output]) \n\t"
- "sh $zero, 320(%[output]) \n\t"
- "sh $zero, 384(%[output]) \n\t"
- "sh $zero, 448(%[output]) \n\t"
- "sh $zero, 512(%[output]) \n\t"
- "sh $zero, 576(%[output]) \n\t"
- "sh $zero, 640(%[output]) \n\t"
- "sh $zero, 704(%[output]) \n\t"
- "sh $zero, 768(%[output]) \n\t"
- "sh $zero, 832(%[output]) \n\t"
- "sh $zero, 896(%[output]) \n\t"
- "sh $zero, 960(%[output]) \n\t"
- "sh $zero, 1024(%[output]) \n\t"
- "sh $zero, 1088(%[output]) \n\t"
- "sh $zero, 1152(%[output]) \n\t"
- "sh $zero, 1216(%[output]) \n\t"
- "sh $zero, 1280(%[output]) \n\t"
- "sh $zero, 1344(%[output]) \n\t"
- "sh $zero, 1408(%[output]) \n\t"
- "sh $zero, 1472(%[output]) \n\t"
- "sh $zero, 1536(%[output]) \n\t"
- "sh $zero, 1600(%[output]) \n\t"
- "sh $zero, 1664(%[output]) \n\t"
- "sh $zero, 1728(%[output]) \n\t"
- "sh $zero, 1792(%[output]) \n\t"
- "sh $zero, 1856(%[output]) \n\t"
- "sh $zero, 1920(%[output]) \n\t"
- "sh $zero, 1984(%[output]) \n\t"
-
- :
- : [output] "r"(output));
-
- output += 1;
-
- continue;
- }
-
- /* prefetch row */
- prefetch_load((const uint8_t *)(input + 32));
- prefetch_load((const uint8_t *)(input + 48));
-
- __asm__ __volatile__(
- "lh %[load1], 2(%[input]) \n\t"
- "lh %[load2], 62(%[input]) \n\t"
- "lh %[load3], 34(%[input]) \n\t"
- "lh %[load4], 30(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_31_64] \n\t"
- "msub $ac1, %[load2], %[cospi_1_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_1_64] \n\t"
- "madd $ac3, %[load2], %[cospi_31_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_15_64] \n\t"
- "msub $ac2, %[load4], %[cospi_17_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_17_64] \n\t"
- "madd $ac1, %[load4], %[cospi_15_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp3], %[temp2] \n\t"
- "sub %[load2], %[temp0], %[temp1] \n\t"
-
- "madd $ac1, %[load1], %[cospi_28_64] \n\t"
- "msub $ac1, %[load2], %[cospi_4_64] \n\t"
- "madd $ac3, %[load1], %[cospi_4_64] \n\t"
- "madd $ac3, %[load2], %[cospi_28_64] \n\t"
-
- "extp %[step1_17], $ac1, 31 \n\t"
- "extp %[step1_30], $ac3, 31 \n\t"
- "add %[step1_16], %[temp0], %[temp1] \n\t"
- "add %[step1_31], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
- [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
- [step1_31] "=r"(step1_31)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
- [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
-
- __asm__ __volatile__(
- "lh %[load1], 18(%[input]) \n\t"
- "lh %[load2], 46(%[input]) \n\t"
- "lh %[load3], 50(%[input]) \n\t"
- "lh %[load4], 14(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_23_64] \n\t"
- "msub $ac1, %[load2], %[cospi_9_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_9_64] \n\t"
- "madd $ac3, %[load2], %[cospi_23_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_7_64] \n\t"
- "msub $ac2, %[load4], %[cospi_25_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_25_64] \n\t"
- "madd $ac1, %[load4], %[cospi_7_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp1], %[temp0] \n\t"
- "sub %[load2], %[temp2], %[temp3] \n\t"
-
- "msub $ac1, %[load1], %[cospi_28_64] \n\t"
- "msub $ac1, %[load2], %[cospi_4_64] \n\t"
- "msub $ac3, %[load1], %[cospi_4_64] \n\t"
- "madd $ac3, %[load2], %[cospi_28_64] \n\t"
-
- "extp %[step1_18], $ac1, 31 \n\t"
- "extp %[step1_29], $ac3, 31 \n\t"
- "add %[step1_19], %[temp0], %[temp1] \n\t"
- "add %[step1_28], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
- [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
- [step1_29] "=r"(step1_29)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
- [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
-
- __asm__ __volatile__(
- "lh %[load1], 10(%[input]) \n\t"
- "lh %[load2], 54(%[input]) \n\t"
- "lh %[load3], 42(%[input]) \n\t"
- "lh %[load4], 22(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_27_64] \n\t"
- "msub $ac1, %[load2], %[cospi_5_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_5_64] \n\t"
- "madd $ac3, %[load2], %[cospi_27_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_11_64] \n\t"
- "msub $ac2, %[load4], %[cospi_21_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_21_64] \n\t"
- "madd $ac1, %[load4], %[cospi_11_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp0], %[temp1] \n\t"
- "sub %[load2], %[temp3], %[temp2] \n\t"
-
- "madd $ac1, %[load2], %[cospi_12_64] \n\t"
- "msub $ac1, %[load1], %[cospi_20_64] \n\t"
- "madd $ac3, %[load1], %[cospi_12_64] \n\t"
- "madd $ac3, %[load2], %[cospi_20_64] \n\t"
-
- "extp %[step1_21], $ac1, 31 \n\t"
- "extp %[step1_26], $ac3, 31 \n\t"
- "add %[step1_20], %[temp0], %[temp1] \n\t"
- "add %[step1_27], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
- [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
- [step1_27] "=r"(step1_27)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
- [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
- [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
- __asm__ __volatile__(
- "lh %[load1], 26(%[input]) \n\t"
- "lh %[load2], 38(%[input]) \n\t"
- "lh %[load3], 58(%[input]) \n\t"
- "lh %[load4], 6(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_19_64] \n\t"
- "msub $ac1, %[load2], %[cospi_13_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_13_64] \n\t"
- "madd $ac3, %[load2], %[cospi_19_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_3_64] \n\t"
- "msub $ac2, %[load4], %[cospi_29_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_29_64] \n\t"
- "madd $ac1, %[load4], %[cospi_3_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp1], %[temp0] \n\t"
- "sub %[load2], %[temp2], %[temp3] \n\t"
-
- "msub $ac1, %[load1], %[cospi_12_64] \n\t"
- "msub $ac1, %[load2], %[cospi_20_64] \n\t"
- "msub $ac3, %[load1], %[cospi_20_64] \n\t"
- "madd $ac3, %[load2], %[cospi_12_64] \n\t"
-
- "extp %[step1_22], $ac1, 31 \n\t"
- "extp %[step1_25], $ac3, 31 \n\t"
- "add %[step1_23], %[temp0], %[temp1] \n\t"
- "add %[step1_24], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
- [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
- [step1_25] "=r"(step1_25)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
- [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
- [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
- __asm__ __volatile__(
- "lh %[load1], 4(%[input]) \n\t"
- "lh %[load2], 60(%[input]) \n\t"
- "lh %[load3], 36(%[input]) \n\t"
- "lh %[load4], 28(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_30_64] \n\t"
- "msub $ac1, %[load2], %[cospi_2_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_2_64] \n\t"
- "madd $ac3, %[load2], %[cospi_30_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_14_64] \n\t"
- "msub $ac2, %[load4], %[cospi_18_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_18_64] \n\t"
- "madd $ac1, %[load4], %[cospi_14_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp0], %[temp1] \n\t"
- "sub %[load2], %[temp3], %[temp2] \n\t"
-
- "msub $ac1, %[load1], %[cospi_8_64] \n\t"
- "madd $ac1, %[load2], %[cospi_24_64] \n\t"
- "madd $ac3, %[load1], %[cospi_24_64] \n\t"
- "madd $ac3, %[load2], %[cospi_8_64] \n\t"
-
- "extp %[step2_9], $ac1, 31 \n\t"
- "extp %[step2_14], $ac3, 31 \n\t"
- "add %[step2_8], %[temp0], %[temp1] \n\t"
- "add %[step2_15], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
- [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
- [step2_15] "=r"(step2_15)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
- [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
- [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
- __asm__ __volatile__(
- "lh %[load1], 20(%[input]) \n\t"
- "lh %[load2], 44(%[input]) \n\t"
- "lh %[load3], 52(%[input]) \n\t"
- "lh %[load4], 12(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_22_64] \n\t"
- "msub $ac1, %[load2], %[cospi_10_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_10_64] \n\t"
- "madd $ac3, %[load2], %[cospi_22_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_6_64] \n\t"
- "msub $ac2, %[load4], %[cospi_26_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_26_64] \n\t"
- "madd $ac1, %[load4], %[cospi_6_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp1], %[temp0] \n\t"
- "sub %[load2], %[temp2], %[temp3] \n\t"
-
- "msub $ac1, %[load1], %[cospi_24_64] \n\t"
- "msub $ac1, %[load2], %[cospi_8_64] \n\t"
- "madd $ac3, %[load2], %[cospi_24_64] \n\t"
- "msub $ac3, %[load1], %[cospi_8_64] \n\t"
-
- "extp %[step2_10], $ac1, 31 \n\t"
- "extp %[step2_13], $ac3, 31 \n\t"
- "add %[step2_11], %[temp0], %[temp1] \n\t"
- "add %[step2_12], %[temp2], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
- [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
- [step2_13] "=r"(step2_13)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
- [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
- [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "sub %[temp0], %[step2_14], %[step2_13] \n\t"
- "sub %[temp0], %[temp0], %[step2_9] \n\t"
- "add %[temp0], %[temp0], %[step2_10] \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sub %[temp1], %[step2_14], %[step2_13] \n\t"
- "add %[temp1], %[temp1], %[step2_9] \n\t"
- "sub %[temp1], %[temp1], %[step2_10] \n\t"
- "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
-
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "sub %[temp0], %[step2_15], %[step2_12] \n\t"
- "sub %[temp0], %[temp0], %[step2_8] \n\t"
- "add %[temp0], %[temp0], %[step2_11] \n\t"
- "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
-
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sub %[temp1], %[step2_15], %[step2_12] \n\t"
- "add %[temp1], %[temp1], %[step2_8] \n\t"
- "sub %[temp1], %[temp1], %[step2_11] \n\t"
- "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
-
- "add %[step3_8], %[step2_8], %[step2_11] \n\t"
- "add %[step3_9], %[step2_9], %[step2_10] \n\t"
- "add %[step3_14], %[step2_13], %[step2_14] \n\t"
- "add %[step3_15], %[step2_12], %[step2_15] \n\t"
-
- "extp %[step3_10], $ac0, 31 \n\t"
- "extp %[step3_13], $ac1, 31 \n\t"
- "extp %[step3_11], $ac2, 31 \n\t"
- "extp %[step3_12], $ac3, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
- [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
- [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
- [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
- [step3_15] "=r"(step3_15)
- : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
- [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
- [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
- [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
- [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
-
- step2_18 = step1_17 - step1_18;
- step2_29 = step1_30 - step1_29;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
- "extp %[step3_18], $ac0, 31 \n\t"
-
- : [step3_18] "=r"(step3_18)
- : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
- [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
- step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_19 = step1_16 - step1_19;
- step2_28 = step1_31 - step1_28;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
- "extp %[step3_19], $ac0, 31 \n\t"
-
- : [step3_19] "=r"(step3_19)
- : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
- [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
- step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_16 = step1_16 + step1_19;
- step3_17 = step1_17 + step1_18;
- step3_30 = step1_29 + step1_30;
- step3_31 = step1_28 + step1_31;
-
- step2_20 = step1_23 - step1_20;
- step2_27 = step1_24 - step1_27;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
- "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
- "extp %[step3_20], $ac0, 31 \n\t"
-
- : [step3_20] "=r"(step3_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
- step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_21 = step1_22 - step1_21;
- step2_26 = step1_25 - step1_26;
-
- __asm__ __volatile__(
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
- "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
- "extp %[step3_21], $ac1, 31 \n\t"
-
- : [step3_21] "=r"(step3_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
- [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64));
-
- temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
- step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_22 = step1_21 + step1_22;
- step3_23 = step1_20 + step1_23;
- step3_24 = step1_24 + step1_27;
- step3_25 = step1_25 + step1_26;
-
- step2_16 = step3_16 + step3_23;
- step2_17 = step3_17 + step3_22;
- step2_18 = step3_18 + step3_21;
- step2_19 = step3_19 + step3_20;
- step2_20 = step3_19 - step3_20;
- step2_21 = step3_18 - step3_21;
- step2_22 = step3_17 - step3_22;
- step2_23 = step3_16 - step3_23;
-
- step2_24 = step3_31 - step3_24;
- step2_25 = step3_30 - step3_25;
- step2_26 = step3_29 - step3_26;
- step2_27 = step3_28 - step3_27;
- step2_28 = step3_28 + step3_27;
- step2_29 = step3_29 + step3_26;
- step2_30 = step3_30 + step3_25;
- step2_31 = step3_31 + step3_24;
-
- __asm__ __volatile__(
- "lh %[load1], 0(%[input]) \n\t"
- "lh %[load2], 32(%[input]) \n\t"
- "lh %[load3], 16(%[input]) \n\t"
- "lh %[load4], 48(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "add %[result1], %[load1], %[load2] \n\t"
- "sub %[result2], %[load1], %[load2] \n\t"
- "madd $ac1, %[result1], %[cospi_16_64] \n\t"
- "madd $ac2, %[result2], %[cospi_16_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "madd $ac3, %[load3], %[cospi_24_64] \n\t"
- "msub $ac3, %[load4], %[cospi_8_64] \n\t"
- "extp %[temp2], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "madd $ac1, %[load3], %[cospi_8_64] \n\t"
- "madd $ac1, %[load4], %[cospi_24_64] \n\t"
- "extp %[temp3], $ac1, 31 \n\t"
-
- "add %[step1_0], %[temp0], %[temp3] \n\t"
- "add %[step1_1], %[temp1], %[temp2] \n\t"
- "sub %[step1_2], %[temp1], %[temp2] \n\t"
- "sub %[step1_3], %[temp0], %[temp3] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [result1] "=&r"(result1),
- [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
- [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
- [step1_3] "=r"(step1_3)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64)
-
- );
-
- __asm__ __volatile__(
- "lh %[load1], 8(%[input]) \n\t"
- "lh %[load2], 56(%[input]) \n\t"
- "lh %[load3], 40(%[input]) \n\t"
- "lh %[load4], 24(%[input]) \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "madd $ac1, %[load1], %[cospi_28_64] \n\t"
- "msub $ac1, %[load2], %[cospi_4_64] \n\t"
- "extp %[temp0], $ac1, 31 \n\t"
-
- "madd $ac3, %[load1], %[cospi_4_64] \n\t"
- "madd $ac3, %[load2], %[cospi_28_64] \n\t"
- "extp %[temp3], $ac3, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
-
- "madd $ac2, %[load3], %[cospi_12_64] \n\t"
- "msub $ac2, %[load4], %[cospi_20_64] \n\t"
- "extp %[temp1], $ac2, 31 \n\t"
-
- "madd $ac1, %[load3], %[cospi_20_64] \n\t"
- "madd $ac1, %[load4], %[cospi_12_64] \n\t"
- "extp %[temp2], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mtlo %[const_2_power_13], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "sub %[load1], %[temp3], %[temp2] \n\t"
- "sub %[load1], %[load1], %[temp0] \n\t"
- "add %[load1], %[load1], %[temp1] \n\t"
-
- "sub %[load2], %[temp0], %[temp1] \n\t"
- "sub %[load2], %[load2], %[temp2] \n\t"
- "add %[load2], %[load2], %[temp3] \n\t"
-
- "madd $ac1, %[load1], %[cospi_16_64] \n\t"
- "madd $ac3, %[load2], %[cospi_16_64] \n\t"
-
- "extp %[step1_5], $ac1, 31 \n\t"
- "extp %[step1_6], $ac3, 31 \n\t"
- "add %[step1_4], %[temp0], %[temp1] \n\t"
- "add %[step1_7], %[temp3], %[temp2] \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
- [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
- [step1_7] "=r"(step1_7)
- : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
- [cospi_16_64] "r"(cospi_16_64));
-
- step2_0 = step1_0 + step1_7;
- step2_1 = step1_1 + step1_6;
- step2_2 = step1_2 + step1_5;
- step2_3 = step1_3 + step1_4;
- step2_4 = step1_3 - step1_4;
- step2_5 = step1_2 - step1_5;
- step2_6 = step1_1 - step1_6;
- step2_7 = step1_0 - step1_7;
-
- step1_0 = step2_0 + step3_15;
- step1_1 = step2_1 + step3_14;
- step1_2 = step2_2 + step3_13;
- step1_3 = step2_3 + step3_12;
- step1_4 = step2_4 + step3_11;
- step1_5 = step2_5 + step3_10;
- step1_6 = step2_6 + step3_9;
- step1_7 = step2_7 + step3_8;
- step1_8 = step2_7 - step3_8;
- step1_9 = step2_6 - step3_9;
- step1_10 = step2_5 - step3_10;
- step1_11 = step2_4 - step3_11;
- step1_12 = step2_3 - step3_12;
- step1_13 = step2_2 - step3_13;
- step1_14 = step2_1 - step3_14;
- step1_15 = step2_0 - step3_15;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_27], %[step2_20] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_20], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_20 + step2_27) * cospi_16_64;
- step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_26], %[step2_21] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_21], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
- [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_21 + step2_26) * cospi_16_64;
- step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_25], %[step2_22] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_22], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
- : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
- [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_22 + step2_25) * cospi_16_64;
- step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_24], %[step2_23] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_23], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
- : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
- [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_23 + step2_24) * cospi_16_64;
- step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- // final stage
- output[0 * 32] = step1_0 + step2_31;
- output[1 * 32] = step1_1 + step2_30;
- output[2 * 32] = step1_2 + step2_29;
- output[3 * 32] = step1_3 + step2_28;
- output[4 * 32] = step1_4 + step1_27;
- output[5 * 32] = step1_5 + step1_26;
- output[6 * 32] = step1_6 + step1_25;
- output[7 * 32] = step1_7 + step1_24;
- output[8 * 32] = step1_8 + step1_23;
- output[9 * 32] = step1_9 + step1_22;
- output[10 * 32] = step1_10 + step1_21;
- output[11 * 32] = step1_11 + step1_20;
- output[12 * 32] = step1_12 + step2_19;
- output[13 * 32] = step1_13 + step2_18;
- output[14 * 32] = step1_14 + step2_17;
- output[15 * 32] = step1_15 + step2_16;
- output[16 * 32] = step1_15 - step2_16;
- output[17 * 32] = step1_14 - step2_17;
- output[18 * 32] = step1_13 - step2_18;
- output[19 * 32] = step1_12 - step2_19;
- output[20 * 32] = step1_11 - step1_20;
- output[21 * 32] = step1_10 - step1_21;
- output[22 * 32] = step1_9 - step1_22;
- output[23 * 32] = step1_8 - step1_23;
- output[24 * 32] = step1_7 - step1_24;
- output[25 * 32] = step1_6 - step1_25;
- output[26 * 32] = step1_5 - step1_26;
- output[27 * 32] = step1_4 - step1_27;
- output[28 * 32] = step1_3 - step2_28;
- output[29 * 32] = step1_2 - step2_29;
- output[30 * 32] = step1_1 - step2_30;
- output[31 * 32] = step1_0 - step2_31;
-
- input += 32;
- output += 1;
- }
-}
-
-void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) { // 32x32 entry point: row pass into the local buffer out, then a column pass that is handed dest.
- DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
- int16_t *outptr = out;
- uint32_t pos = 45;
-
- /* bit position for extract from acc: wrdsp stores pos (45) for the later extp instructions */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- // Rows: input -> out; the third argument (32) is presumably the number of rows to process
- idct32_rows_dspr2(input, outptr, 32);
-
- // Columns: consumes out; presumably adds the result into dest (helper defined elsewhere)
- aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
- int stride) { // Reduced 32x32 path: row pass with count 8, explicit zeroing of the rest of out, then the column pass.
- DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
- int16_t *outptr = out;
- uint32_t i;
- uint32_t pos = 45;
-
- /* bit position for extract from acc: wrdsp stores pos (45) for the later extp instructions */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- // Rows: called with 8 instead of 32; presumably fills elements 0..7 of each 32-element line of out
- idct32_rows_dspr2(input, outptr, 8);
-
- outptr += 8; // skip the first 8 int16_t of the line; the stores below clear the remaining 24
- __asm__ __volatile__(
- "sw $zero, 0(%[outptr]) \n\t" /* 12 word stores = 48 bytes = out[8..31] of line 0 */
- "sw $zero, 4(%[outptr]) \n\t"
- "sw $zero, 8(%[outptr]) \n\t"
- "sw $zero, 12(%[outptr]) \n\t"
- "sw $zero, 16(%[outptr]) \n\t"
- "sw $zero, 20(%[outptr]) \n\t"
- "sw $zero, 24(%[outptr]) \n\t"
- "sw $zero, 28(%[outptr]) \n\t"
- "sw $zero, 32(%[outptr]) \n\t"
- "sw $zero, 36(%[outptr]) \n\t"
- "sw $zero, 40(%[outptr]) \n\t"
- "sw $zero, 44(%[outptr]) \n\t"
-
- :
- : [outptr] "r"(outptr));
-
- for (i = 0; i < 31; ++i) { // same zeroing for the remaining 31 lines of out
- outptr += 32; // advance one 32-element line
-
- __asm__ __volatile__(
- "sw $zero, 0(%[outptr]) \n\t"
- "sw $zero, 4(%[outptr]) \n\t"
- "sw $zero, 8(%[outptr]) \n\t"
- "sw $zero, 12(%[outptr]) \n\t"
- "sw $zero, 16(%[outptr]) \n\t"
- "sw $zero, 20(%[outptr]) \n\t"
- "sw $zero, 24(%[outptr]) \n\t"
- "sw $zero, 28(%[outptr]) \n\t"
- "sw $zero, 32(%[outptr]) \n\t"
- "sw $zero, 36(%[outptr]) \n\t"
- "sw $zero, 40(%[outptr]) \n\t"
- "sw $zero, 44(%[outptr]) \n\t"
-
- :
- : [outptr] "r"(outptr));
- }
-
- // Columns: consumes the whole of out; presumably adds the result into dest (helper defined elsewhere)
- aom_idct32_cols_add_blk_dspr2(out, dest, stride);
-}
-
-void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int stride) { // DC-only path: reads only input[0] and applies one constant to 32 rows of 32 bytes of dest.
- int r, out;
- int32_t a1, absa1;
- int32_t vector_a1;
- int32_t t1, t2, t3, t4;
- int32_t vector_1, vector_2, vector_3, vector_4;
- uint32_t pos = 45;
-
- /* bit position for extract from acc (stored by wrdsp) */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
-
- :
- : [pos] "r"(pos));
-
- out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); // macro defined elsewhere
- __asm__ __volatile__(
- "addi %[out], %[out], 32 \n\t" /* out += 32 */
- "sra %[a1], %[out], 6 \n\t" /* a1 = out >> 6 (arithmetic shift) */
-
- : [out] "+r"(out), [a1] "=r"(a1)
- :);
-
- if (a1 < 0) { // negative constant: subtract its magnitude from every byte
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__(
- "abs %[absa1], %[a1] \n\t"
- "replv.qb %[vector_a1], %[absa1] \n\t" /* NOTE(review): replicates the low byte only; confirm |a1| <= 255 */
-
- : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
-
- for (r = 32; r--;) { // 32 rows, 8 words (32 bytes) per row
- __asm__ __volatile__(
- "lw %[t1], 0(%[dest]) \n\t"
- "lw %[t2], 4(%[dest]) \n\t"
- "lw %[t3], 8(%[dest]) \n\t"
- "lw %[t4], 12(%[dest]) \n\t"
- "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" /* per-byte unsigned saturating subtract */
- "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
- "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
- "sw %[vector_1], 0(%[dest]) \n\t"
- "sw %[vector_2], 4(%[dest]) \n\t"
- "sw %[vector_3], 8(%[dest]) \n\t"
- "sw %[vector_4], 12(%[dest]) \n\t"
-
- "lw %[t1], 16(%[dest]) \n\t"
- "lw %[t2], 20(%[dest]) \n\t"
- "lw %[t3], 24(%[dest]) \n\t"
- "lw %[t4], 28(%[dest]) \n\t"
- "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
- "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
- "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
- "sw %[vector_1], 16(%[dest]) \n\t"
- "sw %[vector_2], 20(%[dest]) \n\t"
- "sw %[vector_3], 24(%[dest]) \n\t"
- "sw %[vector_4], 28(%[dest]) \n\t"
-
- "add %[dest], %[dest], %[stride] \n\t" /* next row */
-
- : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
- [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
- [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
- [dest] "+&r"(dest)
- : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
- }
- } else { // non-negative constant: add it to every byte
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
-
- : [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
-
- for (r = 32; r--;) { // 32 rows, 8 words (32 bytes) per row
- __asm__ __volatile__(
- "lw %[t1], 0(%[dest]) \n\t"
- "lw %[t2], 4(%[dest]) \n\t"
- "lw %[t3], 8(%[dest]) \n\t"
- "lw %[t4], 12(%[dest]) \n\t"
- "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" /* per-byte unsigned saturating add */
- "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
- "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
- "sw %[vector_1], 0(%[dest]) \n\t"
- "sw %[vector_2], 4(%[dest]) \n\t"
- "sw %[vector_3], 8(%[dest]) \n\t"
- "sw %[vector_4], 12(%[dest]) \n\t"
-
- "lw %[t1], 16(%[dest]) \n\t"
- "lw %[t2], 20(%[dest]) \n\t"
- "lw %[t3], 24(%[dest]) \n\t"
- "lw %[t4], 28(%[dest]) \n\t"
- "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
- "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
- "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
- "sw %[vector_1], 16(%[dest]) \n\t"
- "sw %[vector_2], 20(%[dest]) \n\t"
- "sw %[vector_3], 24(%[dest]) \n\t"
- "sw %[vector_4], 28(%[dest]) \n\t"
-
- "add %[dest], %[dest], %[stride] \n\t" /* next row */
-
- : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
- [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
- [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
- [dest] "+&r"(dest)
- : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
- }
- }
-}
-#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
deleted file mode 100644
index e6d0367cd..000000000
--- a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) { // Row pass of the 4x4 inverse DCT: 4 iterations, each reading input[0..3] and storing 4 results 4 elements apart in output.
- int16_t step_0, step_1, step_2, step_3;
- int Temp0, Temp1, Temp2, Temp3;
- const int const_2_power_13 = 8192; // preloaded into LO of an accumulator (mtlo) before each madd/msub chain
- int i;
-
- for (i = 4; i--;) { // four iterations, one per group of 4 input coefficients
- __asm__ __volatile__(
- /*
- temp_1 = (input[0] + input[2]) * cospi_16_64;
- step_0 = dct_const_round_shift(temp_1);
-
- temp_2 = (input[0] - input[2]) * cospi_16_64;
- step_1 = dct_const_round_shift(temp_2);
- */
- "lh %[Temp0], 0(%[input]) \n\t"
- "lh %[Temp1], 4(%[input]) \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "add %[Temp2], %[Temp0], %[Temp1] \n\t"
- "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
- "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
- "lh %[Temp0], 2(%[input]) \n\t" /* input[1], used further down */
- "lh %[Temp1], 6(%[input]) \n\t" /* input[3], used further down */
- "extp %[step_0], $ac0, 31 \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t" /* re-arm $ac0 for step_2 */
- "mthi $zero, $ac0 \n\t"
-
- "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
- "extp %[step_1], $ac1, 31 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t" /* re-arm $ac1 for step_3 */
- "mthi $zero, $ac1 \n\t"
-
- /*
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- step_2 = dct_const_round_shift(temp1);
- */
- "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
- "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
- "extp %[step_2], $ac0, 31 \n\t"
-
- /*
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step_3 = dct_const_round_shift(temp2);
- */
- "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
- "extp %[step_3], $ac1, 31 \n\t"
-
- /*
- output[0] = step_0 + step_3;
- output[4] = step_1 + step_2;
- output[8] = step_1 - step_2;
- output[12] = step_0 - step_3;
- */
- "add %[Temp0], %[step_0], %[step_3] \n\t"
- "sh %[Temp0], 0(%[output]) \n\t"
-
- "add %[Temp1], %[step_1], %[step_2] \n\t"
- "sh %[Temp1], 8(%[output]) \n\t" /* byte offset 8 = output[4] */
-
- "sub %[Temp2], %[step_1], %[step_2] \n\t"
- "sh %[Temp2], 16(%[output]) \n\t" /* byte offset 16 = output[8] */
-
- "sub %[Temp3], %[step_0], %[step_3] \n\t"
- "sh %[Temp3], 24(%[output]) \n\t" /* byte offset 24 = output[12] */
-
- : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
- [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
- : [const_2_power_13] "r"(const_2_power_13),
- [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
- [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
-
- input += 4; // next group of 4 coefficients
- output += 1; // next slot; each iteration writes output[0], [4], [8], [12] relative to this pointer
- }
-}
-
-void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) { // Column pass of the 4x4 inverse DCT: each iteration transforms input[0..3] and updates one 4-pixel column of dest.
- int16_t step_0, step_1, step_2, step_3;
- int Temp0, Temp1, Temp2, Temp3;
- const int const_2_power_13 = 8192; // preloaded into LO of an accumulator (mtlo) before each madd/msub chain
- int i;
- uint8_t *dest_pix;
- uint8_t *cm = aom_ff_cropTbl; // lookup table indexed by (pixel + residual) via lbux; presumably clamps to 8 bits - confirm at its definition
-
- /* prefetch aom_ff_cropTbl: eight prefetches at 32-byte steps (offsets 0..224) */
- prefetch_load(aom_ff_cropTbl);
- prefetch_load(aom_ff_cropTbl + 32);
- prefetch_load(aom_ff_cropTbl + 64);
- prefetch_load(aom_ff_cropTbl + 96);
- prefetch_load(aom_ff_cropTbl + 128);
- prefetch_load(aom_ff_cropTbl + 160);
- prefetch_load(aom_ff_cropTbl + 192);
- prefetch_load(aom_ff_cropTbl + 224);
-
- for (i = 0; i < 4; ++i) {
- dest_pix = (dest + i); // top pixel of column i; advanced by dest_stride inside the asm
-
- __asm__ __volatile__(
- /*
- temp_1 = (input[0] + input[2]) * cospi_16_64;
- step_0 = dct_const_round_shift(temp_1);
-
- temp_2 = (input[0] - input[2]) * cospi_16_64;
- step_1 = dct_const_round_shift(temp_2);
- */
- "lh %[Temp0], 0(%[input]) \n\t"
- "lh %[Temp1], 4(%[input]) \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "add %[Temp2], %[Temp0], %[Temp1] \n\t"
- "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
- "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
- "lh %[Temp0], 2(%[input]) \n\t" /* input[1], used further down */
- "lh %[Temp1], 6(%[input]) \n\t" /* input[3], used further down */
- "extp %[step_0], $ac0, 31 \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
-
- "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
- "extp %[step_1], $ac1, 31 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
-
- /*
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- step_2 = dct_const_round_shift(temp1);
- */
- "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
- "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
- "extp %[step_2], $ac0, 31 \n\t"
-
- /*
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step_3 = dct_const_round_shift(temp2);
- */
- "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
- "extp %[step_3], $ac1, 31 \n\t"
-
- /*
- pix[0 * stride] = cm[pix[0 * stride] + ((step_0 + step_3 + 8) >> 4)];
- pix[1 * stride] = cm[pix[1 * stride] + ((step_1 + step_2 + 8) >> 4)];
- pix[2 * stride] = cm[pix[2 * stride] + ((step_1 - step_2 + 8) >> 4)];
- pix[3 * stride] = cm[pix[3 * stride] + ((step_0 - step_3 + 8) >> 4)];
- */
- "add %[Temp0], %[step_0], %[step_3] \n\t"
- "addi %[Temp0], %[Temp0], 8 \n\t"
- "sra %[Temp0], %[Temp0], 4 \n\t"
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "add %[Temp0], %[step_1], %[step_2] \n\t" /* start the next row's value early */
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "addi %[Temp0], %[Temp0], 8 \n\t"
- "sra %[Temp0], %[Temp0], 4 \n\t"
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step_1], %[step_2] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "addi %[Temp0], %[Temp0], 8 \n\t"
- "sra %[Temp0], %[Temp0], 4 \n\t"
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step_0], %[step_3] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "addi %[Temp0], %[Temp0], 8 \n\t"
- "sra %[Temp0], %[Temp0], 4 \n\t"
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
-
- : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
- [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
- [dest_pix] "+r"(dest_pix)
- : [const_2_power_13] "r"(const_2_power_13),
- [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
- [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
- [dest_stride] "r"(dest_stride));
-
- input += 4; // next group of 4 intermediate values
- }
-}
-
-void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) { // 4x4 entry point: row pass into the local buffer out, then the column pass that updates dest.
- DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
- int16_t *outptr = out;
- uint32_t pos = 45;
-
- /* bit position for extract from acc: wrdsp stores pos (45) for the extp instructions in both passes */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- // Rows: input -> out
- aom_idct4_rows_dspr2(input, outptr);
-
- // Columns: out -> dest (pixels are read, modified and written back)
- aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) { // DC-only 4x4 path: reads only input[0] and applies one constant to 4 rows of 4 bytes of dest.
- int a1, absa1;
- int r;
- int32_t out;
- int t2, vector_a1, vector_a;
- uint32_t pos = 45;
- int16_t input_dc = input[0];
-
- /* bit position for extract from acc (stored by wrdsp) */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
-
- :
- : [pos] "r"(pos));
-
- out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); // macro defined elsewhere
- __asm__ __volatile__(
- "addi %[out], %[out], 8 \n\t" /* out += 8 */
- "sra %[a1], %[out], 4 \n\t" /* a1 = out >> 4 (arithmetic shift) */
-
- : [out] "+r"(out), [a1] "=r"(a1)
- :);
-
- if (a1 < 0) { // negative constant: subtract its magnitude from every byte
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__(
- "abs %[absa1], %[a1] \n\t"
- "replv.qb %[vector_a1], %[absa1] \n\t" /* NOTE(review): replicates the low byte only; confirm |a1| <= 255 */
-
- : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
-
- for (r = 4; r--;) { // 4 rows, one word (4 bytes) per row
- __asm__ __volatile__(
- "lw %[t2], 0(%[dest]) \n\t"
- "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" /* per-byte unsigned saturating subtract */
- "sw %[vector_a], 0(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t" /* next row */
-
- : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
- }
- } else { // non-negative constant: add it to every byte
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
- : [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
-
- for (r = 4; r--;) { // 4 rows, one word (4 bytes) per row
- __asm__ __volatile__(
- "lw %[t2], 0(%[dest]) \n\t"
- "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" /* per-byte unsigned saturating add */
- "sw %[vector_a], 0(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t" /* next row */
-
- : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
- }
- }
-}
-
-void iadst4_dspr2(const int16_t *input, int16_t *output) { // 4-point transform in plain C (no inline asm): reads input[0..3], writes output[0..3].
- int s0, s1, s2, s3, s4, s5, s6, s7;
- int x0, x1, x2, x3;
-
- x0 = input[0];
- x1 = input[1];
- x2 = input[2];
- x3 = input[3];
-
- if (!(x0 | x1 | x2 | x3)) { // all four inputs are zero: write zeros and skip the arithmetic
- output[0] = output[1] = output[2] = output[3] = 0;
- return;
- }
-
- s0 = sinpi_1_9 * x0; // sinpi_* constants are declared elsewhere
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = x0 - x2 + x3; // unscaled combination; multiplied by sinpi_3_9 below
-
- x0 = s0 + s3 + s5; // x0..x3 are reused as intermediate sums from here on
- x1 = s1 - s4 - s6;
- x2 = sinpi_3_9 * s7;
- x3 = s2;
-
- s0 = x0 + x3;
- s1 = x1 + x3;
- s2 = x2;
- s3 = x0 + x1 - x3;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b (after dct_const_round_shift, defined elsewhere).
- output[0] = dct_const_round_shift(s0);
- output[1] = dct_const_round_shift(s1);
- output[2] = dct_const_round_shift(s2);
- output[3] = dct_const_round_shift(s3);
-}
-#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
deleted file mode 100644
index 0a20f76f2..000000000
--- a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
+++ /dev/null
@@ -1,645 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
- int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
- const int const_2_power_13 = 8192;
- int Temp0, Temp1, Temp2, Temp3, Temp4;
- int i;
-
- for (i = no_rows; i--;) {
- __asm__ __volatile__(
- /*
- temp_1 = (input[0] + input[4]) * cospi_16_64;
- step2_0 = dct_const_round_shift(temp_1);
-
- temp_2 = (input[0] - input[4]) * cospi_16_64;
- step2_1 = dct_const_round_shift(temp_2);
- */
- "lh %[Temp0], 0(%[input]) \n\t"
- "lh %[Temp1], 8(%[input]) \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "add %[Temp2], %[Temp0], %[Temp1] \n\t"
- "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
- "extp %[Temp4], $ac0, 31 \n\t"
-
- "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
- "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- /*
- temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
- step2_2 = dct_const_round_shift(temp_1);
- */
- "lh %[Temp0], 4(%[input]) \n\t"
- "lh %[Temp1], 12(%[input]) \n\t"
- "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
- "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "extp %[Temp3], $ac0, 31 \n\t"
-
- /*
- step1_1 = step2_1 + step2_2;
- step1_2 = step2_1 - step2_2;
- */
- "add %[step1_1], %[Temp2], %[Temp3] \n\t"
- "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
-
- /*
- temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
- step2_3 = dct_const_round_shift(temp_2);
- */
- "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
-
- /*
- step1_0 = step2_0 + step2_3;
- step1_3 = step2_0 - step2_3;
- */
- "add %[step1_0], %[Temp4], %[Temp1] \n\t"
- "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
-
- /*
- temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- step1_4 = dct_const_round_shift(temp_1);
- */
- "lh %[Temp0], 2(%[input]) \n\t"
- "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "lh %[Temp1], 14(%[input]) \n\t"
- "lh %[Temp0], 2(%[input]) \n\t"
- "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
- "extp %[step1_4], $ac0, 31 \n\t"
-
- /*
- temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1_7 = dct_const_round_shift(temp_2);
- */
- "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
- "extp %[step1_7], $ac1, 31 \n\t"
-
- /*
- temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- step1_5 = dct_const_round_shift(temp_1);
- */
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "lh %[Temp0], 10(%[input]) \n\t"
- "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
- "lh %[Temp1], 6(%[input]) \n\t"
- "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
- "extp %[step1_5], $ac0, 31 \n\t"
-
- /*
- temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1_6 = dct_const_round_shift(temp_2);
- */
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "lh %[Temp0], 10(%[input]) \n\t"
- "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
- "lh %[Temp1], 6(%[input]) \n\t"
- "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
- "extp %[step1_6], $ac1, 31 \n\t"
-
- /*
- temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
- temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
- */
- "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
- "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
- "add %[Temp0], %[Temp0], %[step1_5] \n\t"
- "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
- "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
- "add %[Temp1], %[Temp1], %[step1_7] \n\t"
-
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
-
- "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
-
- /*
- step1_4 = step1_4 + step1_5;
- step1_7 = step1_6 + step1_7;
- */
- "add %[step1_4], %[step1_4], %[step1_5] \n\t"
- "add %[step1_7], %[step1_7], %[step1_6] \n\t"
-
- "extp %[step1_5], $ac0, 31 \n\t"
- "extp %[step1_6], $ac1, 31 \n\t"
-
- "add %[Temp0], %[step1_0], %[step1_7] \n\t"
- "sh %[Temp0], 0(%[output]) \n\t"
- "add %[Temp1], %[step1_1], %[step1_6] \n\t"
- "sh %[Temp1], 16(%[output]) \n\t"
- "add %[Temp0], %[step1_2], %[step1_5] \n\t"
- "sh %[Temp0], 32(%[output]) \n\t"
- "add %[Temp1], %[step1_3], %[step1_4] \n\t"
- "sh %[Temp1], 48(%[output]) \n\t"
-
- "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
- "sh %[Temp0], 64(%[output]) \n\t"
- "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
- "sh %[Temp1], 80(%[output]) \n\t"
- "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
- "sh %[Temp0], 96(%[output]) \n\t"
- "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
- "sh %[Temp1], 112(%[output]) \n\t"
-
- : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
- [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
- [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
- [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
- [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
- : [const_2_power_13] "r"(const_2_power_13),
- [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
- [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
- [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
- [input] "r"(input));
-
- input += 8;
- output += 1;
- }
-}
-
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
- int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
- int Temp0, Temp1, Temp2, Temp3;
- int i;
- const int const_2_power_13 = 8192;
- uint8_t *dest_pix;
- uint8_t *cm = aom_ff_cropTbl;
-
- /* prefetch aom_ff_cropTbl */
- prefetch_load(aom_ff_cropTbl);
- prefetch_load(aom_ff_cropTbl + 32);
- prefetch_load(aom_ff_cropTbl + 64);
- prefetch_load(aom_ff_cropTbl + 96);
- prefetch_load(aom_ff_cropTbl + 128);
- prefetch_load(aom_ff_cropTbl + 160);
- prefetch_load(aom_ff_cropTbl + 192);
- prefetch_load(aom_ff_cropTbl + 224);
-
- for (i = 0; i < 8; ++i) {
- dest_pix = (dest + i);
-
- __asm__ __volatile__(
- /*
- temp_1 = (input[0] + input[4]) * cospi_16_64;
- step2_0 = dct_const_round_shift(temp_1);
-
- temp_2 = (input[0] - input[4]) * cospi_16_64;
- step2_1 = dct_const_round_shift(temp_2);
- */
- "lh %[Temp0], 0(%[input]) \n\t"
- "lh %[Temp1], 8(%[input]) \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "add %[Temp2], %[Temp0], %[Temp1] \n\t"
- "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
- "extp %[step1_6], $ac0, 31 \n\t"
-
- "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
- "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- /*
- temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
- step2_2 = dct_const_round_shift(temp_1);
- */
- "lh %[Temp0], 4(%[input]) \n\t"
- "lh %[Temp1], 12(%[input]) \n\t"
- "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
- "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "extp %[Temp3], $ac0, 31 \n\t"
-
- /*
- step1_1 = step2_1 + step2_2;
- step1_2 = step2_1 - step2_2;
- */
- "add %[step1_1], %[Temp2], %[Temp3] \n\t"
- "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
-
- /*
- temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
- step2_3 = dct_const_round_shift(temp_2);
- */
- "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
-
- /*
- step1_0 = step2_0 + step2_3;
- step1_3 = step2_0 - step2_3;
- */
- "add %[step1_0], %[step1_6], %[Temp1] \n\t"
- "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
-
- /*
- temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- step1_4 = dct_const_round_shift(temp_1);
- */
- "lh %[Temp0], 2(%[input]) \n\t"
- "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "lh %[Temp1], 14(%[input]) \n\t"
- "lh %[Temp0], 2(%[input]) \n\t"
- "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
- "extp %[step1_4], $ac0, 31 \n\t"
-
- /*
- temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1_7 = dct_const_round_shift(temp_2);
- */
- "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
- "extp %[step1_7], $ac1, 31 \n\t"
-
- /*
- temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- step1_5 = dct_const_round_shift(temp_1);
- */
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "lh %[Temp0], 10(%[input]) \n\t"
- "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
- "lh %[Temp1], 6(%[input]) \n\t"
- "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
- "extp %[step1_5], $ac0, 31 \n\t"
-
- /*
- temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1_6 = dct_const_round_shift(temp_2);
- */
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "lh %[Temp0], 10(%[input]) \n\t"
- "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
- "lh %[Temp1], 6(%[input]) \n\t"
- "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
- "extp %[step1_6], $ac1, 31 \n\t"
-
- /*
- temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
- temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
- */
- "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
- "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
- "add %[Temp0], %[Temp0], %[step1_5] \n\t"
- "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
- "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
- "add %[Temp1], %[Temp1], %[step1_7] \n\t"
-
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mtlo %[const_2_power_13], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
-
- "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
- "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
-
- /*
- step1_4 = step1_4 + step1_5;
- step1_7 = step1_6 + step1_7;
- */
- "add %[step1_4], %[step1_4], %[step1_5] \n\t"
- "add %[step1_7], %[step1_7], %[step1_6] \n\t"
-
- "extp %[step1_5], $ac0, 31 \n\t"
- "extp %[step1_6], $ac1, 31 \n\t"
-
- /* add block */
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "add %[Temp0], %[step1_0], %[step1_7] \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "add %[Temp0], %[step1_1], %[step1_6] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "add %[Temp0], %[step1_2], %[step1_5] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "add %[Temp0], %[step1_3], %[step1_4] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
-
- "lbu %[Temp1], 0(%[dest_pix]) \n\t"
- "addi %[Temp0], %[Temp0], 16 \n\t"
- "sra %[Temp0], %[Temp0], 5 \n\t"
- "add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
-
- : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
- [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
- [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
- [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
- [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
- : [const_2_power_13] "r"(const_2_power_13),
- [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
- [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
- [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
- [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
- [dest_stride] "r"(dest_stride));
-
- input += 8;
- }
-}
-
-void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
- DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
- int16_t *outptr = out;
- uint32_t pos = 45;
-
- /* bit positon for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
-
- // First transform rows
- idct8_rows_dspr2(input, outptr, 8);
-
- // Then transform columns and add to dest
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
- DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
- int16_t *outptr = out;
- uint32_t pos = 45;
-
- /* bit positon for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
-
- // First transform rows
- idct8_rows_dspr2(input, outptr, 4);
-
- outptr += 4;
-
- __asm__ __volatile__(
- "sw $zero, 0(%[outptr]) \n\t"
- "sw $zero, 4(%[outptr]) \n\t"
- "sw $zero, 16(%[outptr]) \n\t"
- "sw $zero, 20(%[outptr]) \n\t"
- "sw $zero, 32(%[outptr]) \n\t"
- "sw $zero, 36(%[outptr]) \n\t"
- "sw $zero, 48(%[outptr]) \n\t"
- "sw $zero, 52(%[outptr]) \n\t"
- "sw $zero, 64(%[outptr]) \n\t"
- "sw $zero, 68(%[outptr]) \n\t"
- "sw $zero, 80(%[outptr]) \n\t"
- "sw $zero, 84(%[outptr]) \n\t"
- "sw $zero, 96(%[outptr]) \n\t"
- "sw $zero, 100(%[outptr]) \n\t"
- "sw $zero, 112(%[outptr]) \n\t"
- "sw $zero, 116(%[outptr]) \n\t"
-
- :
- : [outptr] "r"(outptr));
-
- // Then transform columns and add to dest
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
- uint32_t pos = 45;
- int32_t out;
- int32_t r;
- int32_t a1, absa1;
- int32_t t1, t2, vector_a1, vector_1, vector_2;
-
- /* bit positon for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
-
- :
- : [pos] "r"(pos));
-
- out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
- __asm__ __volatile__(
- "addi %[out], %[out], 16 \n\t"
- "sra %[a1], %[out], 5 \n\t"
-
- : [out] "+r"(out), [a1] "=r"(a1)
- :);
-
- if (a1 < 0) {
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__(
- "abs %[absa1], %[a1] \n\t"
- "replv.qb %[vector_a1], %[absa1] \n\t"
-
- : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
-
- for (r = 8; r--;) {
- __asm__ __volatile__(
- "lw %[t1], 0(%[dest]) \n\t"
- "lw %[t2], 4(%[dest]) \n\t"
- "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
- "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "sw %[vector_1], 0(%[dest]) \n\t"
- "sw %[vector_2], 4(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
-
- : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
- [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
- }
- } else {
- /* use quad-byte
- * input and output memory are four byte aligned */
- __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
-
- : [vector_a1] "=r"(vector_a1)
- : [a1] "r"(a1));
-
- for (r = 8; r--;) {
- __asm__ __volatile__(
- "lw %[t1], 0(%[dest]) \n\t"
- "lw %[t2], 4(%[dest]) \n\t"
- "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
- "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
- "sw %[vector_1], 0(%[dest]) \n\t"
- "sw %[vector_2], 4(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
-
- : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
- [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
- }
- }
-}
-
-void iadst8_dspr2(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
- int x0, x1, x2, x3, x4, x5, x6, x7;
-
- x0 = input[7];
- x1 = input[0];
- x2 = input[5];
- x3 = input[2];
- x4 = input[3];
- x5 = input[4];
- x6 = input[1];
- x7 = input[6];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = 0;
- return;
- }
-
- // stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
- x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
- x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
- x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
- x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
- x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
- x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
- x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
- x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
- x0 = s0 + s2;
- x1 = s1 + s3;
- x2 = s0 - s2;
- x3 = s1 - s3;
- x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
- x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
- x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
- x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
-
- // stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
-
- x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
- x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
- x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
- x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
-
- output[0] = x0;
- output[1] = -x4;
- output[2] = x6;
- output[3] = -x2;
- output[4] = x3;
- output[5] = -x7;
- output[6] = x5;
- output[7] = -x1;
-}
-#endif // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
index fc0c32ce3..38a10e9b2 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
@@ -404,10 +404,11 @@ void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
}
}
-void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr, int32_t count) {
+static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr,
+ int32_t count) {
DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
uint8_t early_exit = 0;
@@ -639,19 +640,19 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
}
}
} else {
- aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
- thresh_ptr, count);
+ mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
+ count);
}
}
-void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
+void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}
-void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
+void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr,
const uint8_t *thresh_ptr) {
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
index 883d0523d..8c41278be 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
@@ -11,7 +11,8 @@
#include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/mips/common_dspr2.h"
#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
index 72df09823..3e38ef3fb 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
@@ -14,7 +14,8 @@
#include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
index 3e6994714..cb599cf2e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
@@ -14,7 +14,8 @@
#include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
index 8db3e521f..6db1dac08 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -14,7 +14,8 @@
#include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
index a3b5a9eb1..b67ccfe9d 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
@@ -11,7 +11,8 @@
#include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/mips/common_dspr2.h"
#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 8d2fd69f7..34733e42e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -11,7 +11,8 @@
#include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/mips/common_dspr2.h"
#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
@@ -718,14 +719,13 @@ static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
}
}
-void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
+void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
}
-void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,
+void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
index 28528869b..3d3f1ec97 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -11,7 +11,8 @@
#include <stdlib.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
#include "aom_dsp/mips/common_dspr2.h"
#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
index 48fbcfd47..eb919d42b 100644
--- a/third_party/aom/aom_dsp/mips/macros_msa.h
+++ b/third_party/aom/aom_dsp/mips/macros_msa.h
@@ -14,7 +14,8 @@
#include <msa.h>
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
#include "aom/aom_integer.h"
#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
index 258eb5c07..58cdd80d9 100644
--- a/third_party/aom/aom_dsp/mips/sad_msa.c
+++ b/third_party/aom/aom_dsp/mips/sad_msa.c
@@ -9,7 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/macros_msa.h"
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
@@ -160,640 +161,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
return sad;
}
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 ref0, ref1, ref2, ref3, diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- INSERT_W4_UB(src0, src1, src2, src3, src);
-
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
- ref += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
- ref0, ref1);
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src, ref, ref0, ref1, diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
-
- for (ht_cnt = height >> 1; ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
- ref += ref_stride;
-
- sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
- ref += ref_stride;
-
- sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
- v8u16 sad0_0 = { 0 };
- v8u16 sad0_1 = { 0 };
- v8u16 sad1_0 = { 0 };
- v8u16 sad1_1 = { 0 };
- v8u16 sad2_0 = { 0 };
- v8u16 sad2_1 = { 0 };
- v4u32 sad;
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
- ref0_4 = LD_UB(ref + 64);
- ref += ref_stride;
-
- sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = __msa_hadd_u_w(sad0_0, sad0_0);
- sad += __msa_hadd_u_w(sad0_1, sad0_1);
- sad_array[0] = HADD_SW_S32((v4i32)sad);
-
- sad = __msa_hadd_u_w(sad1_0, sad1_0);
- sad += __msa_hadd_u_w(sad1_1, sad1_1);
- sad_array[1] = HADD_SW_S32((v4i32)sad);
-
- sad = __msa_hadd_u_w(sad2_0, sad2_0);
- sad += __msa_hadd_u_w(sad2_1, sad2_1);
- sad_array[2] = HADD_SW_S32((v4i32)sad);
-}
-
-static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3, diff;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- INSERT_W4_UB(src0, src1, src2, src3, src);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad4 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad5 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad6 += __msa_hadd_u_h(diff, diff);
-
- SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
- SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
- SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
- diff = __msa_asub_u_b(src, ref);
- sad7 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
- ref += (4 * ref_stride);
- PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
- ref0, ref1);
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
- SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
- PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
- sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src, ref0, ref1, ref;
- v16u8 diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
- diff = __msa_asub_u_b(src, ref);
- sad4 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
- diff = __msa_asub_u_b(src, ref);
- sad5 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
- diff = __msa_asub_u_b(src, ref);
- sad6 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
- diff = __msa_asub_u_b(src, ref);
- sad7 += __msa_hadd_u_h(diff, diff);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
- diff = __msa_asub_u_b(src, ref);
- sad4 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
- diff = __msa_asub_u_b(src, ref);
- sad5 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
- diff = __msa_asub_u_b(src, ref);
- sad6 += __msa_hadd_u_h(diff, diff);
-
- ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
- diff = __msa_asub_u_b(src, ref);
- sad7 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- int32_t ht_cnt;
- v16u8 src0, src1;
- v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
- v8u16 sad4 = { 0 };
- v8u16 sad5 = { 0 };
- v8u16 sad6 = { 0 };
- v8u16 sad7 = { 0 };
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
- ref += ref_stride;
-
- sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
- sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
- sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
- sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
- sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
- sad_array[4] = HADD_UH_U32(sad4);
- sad_array[5] = HADD_UH_U32(sad5);
- sad_array[6] = HADD_UH_U32(sad6);
- sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, uint32_t *sad_array) {
- const uint8_t *src_dup, *ref_dup;
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 sad0_0 = { 0 };
- v8u16 sad0_1 = { 0 };
- v8u16 sad1_0 = { 0 };
- v8u16 sad1_1 = { 0 };
- v8u16 sad2_0 = { 0 };
- v8u16 sad2_1 = { 0 };
- v8u16 sad3_0 = { 0 };
- v8u16 sad3_1 = { 0 };
- v4u32 sad;
-
- src_dup = src;
- ref_dup = ref;
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
- ref += ref_stride;
-
- sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
- sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = __msa_hadd_u_w(sad0_0, sad0_0);
- sad += __msa_hadd_u_w(sad0_1, sad0_1);
- sad_array[0] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad1_0, sad1_0);
- sad += __msa_hadd_u_w(sad1_1, sad1_1);
- sad_array[1] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad2_0, sad2_0);
- sad += __msa_hadd_u_w(sad2_1, sad2_1);
- sad_array[2] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad3_0, sad3_0);
- sad += __msa_hadd_u_w(sad3_1, sad3_1);
- sad_array[3] = HADD_SW_S32(sad);
-
- sad0_0 = (v8u16)__msa_ldi_h(0);
- sad0_1 = (v8u16)__msa_ldi_h(0);
- sad1_0 = (v8u16)__msa_ldi_h(0);
- sad1_1 = (v8u16)__msa_ldi_h(0);
- sad2_0 = (v8u16)__msa_ldi_h(0);
- sad2_1 = (v8u16)__msa_ldi_h(0);
- sad3_0 = (v8u16)__msa_ldi_h(0);
- sad3_1 = (v8u16)__msa_ldi_h(0);
-
- for (ht_cnt = 64; ht_cnt--;) {
- LD_UB4(src_dup, 16, src0, src1, src2, src3);
- src_dup += src_stride;
- LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
- ref_dup += ref_stride;
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
- sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
- SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
- sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = __msa_hadd_u_w(sad0_0, sad0_0);
- sad += __msa_hadd_u_w(sad0_1, sad0_1);
- sad_array[4] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad1_0, sad1_0);
- sad += __msa_hadd_u_w(sad1_1, sad1_1);
- sad_array[5] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad2_0, sad2_0);
- sad += __msa_hadd_u_w(sad2_1, sad2_1);
- sad_array[6] = HADD_SW_S32(sad);
-
- sad = __msa_hadd_u_w(sad3_0, sad3_0);
- sad += __msa_hadd_u_w(sad3_1, sad3_1);
- sad_array[7] = HADD_SW_S32(sad);
-}
-
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t height,
@@ -1290,76 +657,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
}
-#define AOM_SAD_4xHEIGHTx3_MSA(height) \
- void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_8xHEIGHTx3_MSA(height) \
- void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_16xHEIGHTx3_MSA(height) \
- void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_32xHEIGHTx3_MSA(height) \
- void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_64xHEIGHTx3_MSA(height) \
- void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_4xHEIGHTx8_MSA(height) \
- void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_8xHEIGHTx8_MSA(height) \
- void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_16xHEIGHTx8_MSA(height) \
- void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_32xHEIGHTx8_MSA(height) \
- void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_64xHEIGHTx8_MSA(height) \
- void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
- sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
- }
-
#define AOM_SAD_4xHEIGHTx4D_MSA(height) \
void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[], \
@@ -1438,92 +735,66 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
/* clang-format off */
// 64x64
AOM_SAD_64xHEIGHT_MSA(64)
-AOM_SAD_64xHEIGHTx3_MSA(64)
-AOM_SAD_64xHEIGHTx8_MSA(64)
AOM_SAD_64xHEIGHTx4D_MSA(64)
AOM_AVGSAD_64xHEIGHT_MSA(64)
// 64x32
AOM_SAD_64xHEIGHT_MSA(32)
-AOM_SAD_64xHEIGHTx3_MSA(32)
-AOM_SAD_64xHEIGHTx8_MSA(32)
AOM_SAD_64xHEIGHTx4D_MSA(32)
AOM_AVGSAD_64xHEIGHT_MSA(32)
// 32x64
AOM_SAD_32xHEIGHT_MSA(64)
-AOM_SAD_32xHEIGHTx3_MSA(64)
-AOM_SAD_32xHEIGHTx8_MSA(64)
AOM_SAD_32xHEIGHTx4D_MSA(64)
AOM_AVGSAD_32xHEIGHT_MSA(64)
// 32x32
AOM_SAD_32xHEIGHT_MSA(32)
-AOM_SAD_32xHEIGHTx3_MSA(32)
-AOM_SAD_32xHEIGHTx8_MSA(32)
AOM_SAD_32xHEIGHTx4D_MSA(32)
AOM_AVGSAD_32xHEIGHT_MSA(32)
// 32x16
AOM_SAD_32xHEIGHT_MSA(16)
-AOM_SAD_32xHEIGHTx3_MSA(16)
-AOM_SAD_32xHEIGHTx8_MSA(16)
AOM_SAD_32xHEIGHTx4D_MSA(16)
AOM_AVGSAD_32xHEIGHT_MSA(16)
// 16x32
AOM_SAD_16xHEIGHT_MSA(32)
-AOM_SAD_16xHEIGHTx3_MSA(32)
-AOM_SAD_16xHEIGHTx8_MSA(32)
AOM_SAD_16xHEIGHTx4D_MSA(32)
AOM_AVGSAD_16xHEIGHT_MSA(32)
// 16x16
AOM_SAD_16xHEIGHT_MSA(16)
-AOM_SAD_16xHEIGHTx3_MSA(16)
-AOM_SAD_16xHEIGHTx8_MSA(16)
AOM_SAD_16xHEIGHTx4D_MSA(16)
AOM_AVGSAD_16xHEIGHT_MSA(16)
// 16x8
AOM_SAD_16xHEIGHT_MSA(8)
-AOM_SAD_16xHEIGHTx3_MSA(8)
-AOM_SAD_16xHEIGHTx8_MSA(8)
AOM_SAD_16xHEIGHTx4D_MSA(8)
AOM_AVGSAD_16xHEIGHT_MSA(8)
// 8x16
AOM_SAD_8xHEIGHT_MSA(16)
-AOM_SAD_8xHEIGHTx3_MSA(16)
-AOM_SAD_8xHEIGHTx8_MSA(16)
AOM_SAD_8xHEIGHTx4D_MSA(16)
AOM_AVGSAD_8xHEIGHT_MSA(16)
// 8x8
AOM_SAD_8xHEIGHT_MSA(8)
-AOM_SAD_8xHEIGHTx3_MSA(8)
-AOM_SAD_8xHEIGHTx8_MSA(8)
AOM_SAD_8xHEIGHTx4D_MSA(8)
AOM_AVGSAD_8xHEIGHT_MSA(8)
// 8x4
AOM_SAD_8xHEIGHT_MSA(4)
-AOM_SAD_8xHEIGHTx3_MSA(4)
-AOM_SAD_8xHEIGHTx8_MSA(4)
AOM_SAD_8xHEIGHTx4D_MSA(4)
AOM_AVGSAD_8xHEIGHT_MSA(4)
// 4x8
AOM_SAD_4xHEIGHT_MSA(8)
-AOM_SAD_4xHEIGHTx3_MSA(8)
-AOM_SAD_4xHEIGHTx8_MSA(8)
AOM_SAD_4xHEIGHTx4D_MSA(8)
AOM_AVGSAD_4xHEIGHT_MSA(8)
// 4x4
AOM_SAD_4xHEIGHT_MSA(4)
-AOM_SAD_4xHEIGHTx3_MSA(4)
-AOM_SAD_4xHEIGHTx8_MSA(4)
AOM_SAD_4xHEIGHTx4D_MSA(4)
AOM_AVGSAD_4xHEIGHT_MSA(4)
/* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
index 3eb85107d..a8ee85b6b 100644
--- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
+++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -9,7 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_ports/mem.h"
#include "aom_dsp/mips/macros_msa.h"
#include "aom_dsp/variance.h"
diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c
index 37b89765d..bfed773ac 100644
--- a/third_party/aom/aom_dsp/mips/subtract_msa.c
+++ b/third_party/aom/aom_dsp/mips/subtract_msa.c
@@ -9,7 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/macros_msa.h"
static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
deleted file mode 100644
index cba5d4445..000000000
--- a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
-#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
- { \
- v8i16 k0_m = __msa_fill_h(cnst0); \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- s0_m = (v4i32)__msa_fill_h(cnst1); \
- k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
- \
- ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
- ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
- DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
- \
- DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
- SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
- out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
- }
-
-#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \
- dst1, dst2, dst3) \
- { \
- v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
- v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
- \
- DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \
- tp4_m); \
- DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \
- tp8_m); \
- BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
- BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
- SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
- SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
- PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
- dst1, dst2, dst3); \
- }
-
-#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \
- ({ \
- v8i16 dst_m; \
- v4i32 tp0_m, tp1_m; \
- \
- DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
- SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
- dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
- \
- dst_m; \
- })
-
-#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \
- { \
- v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
- v8i16 madd_s0_m, madd_s1_m; \
- \
- ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
- madd0_m, madd1_m, madd2_m, madd3_m); \
- SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
- }
-
-#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
- out2, out3) \
- { \
- v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
- \
- ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
- ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
- cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
- DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
- cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
- SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
- }
-#endif // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c
index 745fdfc9c..065c09ac5 100644
--- a/third_party/aom/aom_dsp/mips/variance_msa.c
+++ b/third_party/aom/aom_dsp/mips/variance_msa.c
@@ -9,7 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/mips/macros_msa.h"
#define CALC_MSE_B(src, ref, var) \