diff options
Diffstat (limited to 'third_party/aom/aom_dsp/arm')
46 files changed, 1144 insertions, 11737 deletions
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c deleted file mode 100644 index 09429d6d2..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> -#include <assert.h> - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_y; - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - src += 7; - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz - s = src; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(src + 64); - - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(src + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(src + 64 + src_stride * 2); - - d = dst; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - __builtin_prefetch(src + 64 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - d = dst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - src += src_stride * 4 - w - 7; - dst += dst_stride * 4 - w; - } - return; -} - -void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - uint8x16_t q1u8, q3u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_x; - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - d -= dst_stride * 3; - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - q1u8 = vcombine_u8(d2u8, d3u8); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm deleted file mode 100644 index 80aef992d..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; AV1_FILTER_WEIGHT == 128 - ; AV1_FILTER_SHIFT == 7 - - EXPORT |aom_convolve8_avg_horiz_neon| - EXPORT |aom_convolve8_avg_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|aom_convolve8_avg_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h - - vld1.s16 {q0}, [r5] ; filter_x - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -aom_convolve8_avg_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -aom_convolve8_avg_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; slightly out of order load to match the existing data - vld1.u32 {d6[0]}, [r2], r3 - vld1.u32 {d7[0]}, [r2], r3 - vld1.u32 {d6[1]}, [r2], r3 - vld1.u32 {d7[1]}, [r2], r3 - - sub r2, r2, r3, lsl #2 ; reset for store - - ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_avg_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt aom_convolve8_avg_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|aom_convolve8_avg_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h - - vld1.s16 {q0}, [r4] ; filter_y - - lsl r1, r1, #1 - lsl r3, r3, #1 - -aom_convolve8_avg_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -aom_convolve8_avg_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - vld1.u32 {d6[0]}, [r5@32], r3 - vld1.u32 {d6[1]}, [r8@32], r3 - vld1.u32 {d7[0]}, [r5@32], r3 - vld1.u32 {d7[1]}, [r8@32], r3 - - pld [r7] - pld [r4] - - ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r5] - pld [r8] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - sub r5, r5, r3, lsl #1 ; reset for store - sub r8, r8, r3, lsl #1 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt aom_convolve8_avg_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_avg_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c deleted file mode 100644 index 8ebffb5f9..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> -#include <assert.h> - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s, *psrc; - uint8_t *d, *pdst; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_y; - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4, src += src_stride * 4, - dst += dst_stride * 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - __builtin_prefetch(src + src_stride * 6); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; width > 0; - width -= 4, psrc += 4, pdst += 4) { // loop_horiz - s = psrc; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(psrc + 64); - - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(psrc + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(psrc + 64 + src_stride * 2); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - __builtin_prefetch(psrc + 60 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); - d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); - - d = pdst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - } - return; -} - -void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint32x2_t d2u32, d3u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_x; - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); - d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm deleted file mode 100644 index 38207d864..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm +++ /dev/null @@ -1,273 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; AV1_FILTER_WEIGHT == 128 - ; AV1_FILTER_SHIFT == 7 - - EXPORT |aom_convolve8_horiz_neon| - EXPORT |aom_convolve8_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|aom_convolve8_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h - - vld1.s16 {q0}, [r5] ; filter_x - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -aom_convolve8_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -aom_convolve8_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt aom_convolve8_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|aom_convolve8_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h - - vld1.s16 {q0}, [r4] ; filter_y - - lsl r1, r1, #1 - lsl r3, r3, #1 - -aom_convolve8_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -aom_convolve8_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - pld [r5] - pld [r8] - - ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r7] - pld [r4] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt aom_convolve8_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c deleted file mode 100644 index f05d3ceae..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" - -void aom_convolve_avg_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - uint8_t *d; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint32x2_t d0u32, d2u32; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - d = dst; - if (w > 32) { // avg64 - for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - q10u8 = vld1q_u8(d + 32); - q11u8 = vld1q_u8(d + 48); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // avg32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - d += dst_stride; - q10u8 = vld1q_u8(d); - q11u8 = vld1q_u8(d + 16); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // avg16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - q2u8 = vld1q_u8(d); - d += dst_stride; - q3u8 = vld1q_u8(d); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q2u8); - q1u8 = vrhaddq_u8(q1u8, q3u8); - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // avg8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d1u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(d); - d += dst_stride; - d3u8 = vld1_u8(d); - d += dst_stride; - - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - q0u8 = vrhaddq_u8(q0u8, q1u8); - - vst1_u8(dst, vget_low_u8(q0u8)); - dst += dst_stride; - vst1_u8(dst, vget_high_u8(q0u8)); - dst += dst_stride; - } - } else { // avg4 - for (; h > 0; h -= 2) { - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); - src += src_stride; - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); - src += src_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); - d += dst_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); - d += dst_stride; - - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32)); - - d0u32 = vreinterpret_u32_u8(d0u8); - vst1_lane_u32((uint32_t *)dst, d0u32, 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, d0u32, 1); - dst += dst_stride; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm deleted file mode 100644 index 43c300954..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm +++ /dev/null @@ -1,119 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_convolve_avg_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|aom_convolve_avg_neon| PROC - push {r4-r6, lr} - ldrd r4, r5, [sp, #32] - mov r6, r2 - - cmp r4, #32 - bgt avg64 - beq avg32 - cmp r4, #8 - bgt avg16 - beq avg8 - b avg4 - -avg64 - sub lr, r1, #32 - sub r4, r3, #32 -avg64_h - pld [r0, r1, lsl #1] - vld1.8 {q0-q1}, [r0]! - vld1.8 {q2-q3}, [r0], lr - pld [r2, r3] - vld1.8 {q8-q9}, [r6@128]! - vld1.8 {q10-q11}, [r6@128], r4 - vrhadd.u8 q0, q0, q8 - vrhadd.u8 q1, q1, q9 - vrhadd.u8 q2, q2, q10 - vrhadd.u8 q3, q3, q11 - vst1.8 {q0-q1}, [r2@128]! - vst1.8 {q2-q3}, [r2@128], r4 - subs r5, r5, #1 - bgt avg64_h - pop {r4-r6, pc} - -avg32 - vld1.8 {q0-q1}, [r0], r1 - vld1.8 {q2-q3}, [r0], r1 - vld1.8 {q8-q9}, [r6@128], r3 - vld1.8 {q10-q11}, [r6@128], r3 - pld [r0] - vrhadd.u8 q0, q0, q8 - pld [r0, r1] - vrhadd.u8 q1, q1, q9 - pld [r6] - vrhadd.u8 q2, q2, q10 - pld [r6, r3] - vrhadd.u8 q3, q3, q11 - vst1.8 {q0-q1}, [r2@128], r3 - vst1.8 {q2-q3}, [r2@128], r3 - subs r5, r5, #2 - bgt avg32 - pop {r4-r6, pc} - -avg16 - vld1.8 {q0}, [r0], r1 - vld1.8 {q1}, [r0], r1 - vld1.8 {q2}, [r6@128], r3 - vld1.8 {q3}, [r6@128], r3 - pld [r0] - pld [r0, r1] - vrhadd.u8 q0, q0, q2 - pld [r6] - pld [r6, r3] - vrhadd.u8 q1, q1, q3 - vst1.8 {q0}, [r2@128], r3 - vst1.8 {q1}, [r2@128], r3 - subs r5, r5, #2 - bgt avg16 - pop {r4-r6, pc} - -avg8 - vld1.8 {d0}, [r0], r1 - vld1.8 {d1}, [r0], r1 - vld1.8 {d2}, [r6@64], r3 - vld1.8 {d3}, [r6@64], r3 - pld [r0] - pld [r0, r1] - vrhadd.u8 q0, q0, q1 - pld [r6] - pld [r6, r3] - vst1.8 {d0}, [r2@64], r3 - vst1.8 {d1}, [r2@64], r3 - subs r5, r5, #2 - bgt avg8 - pop {r4-r6, pc} - -avg4 - vld1.32 {d0[0]}, [r0], r1 - vld1.32 {d0[1]}, [r0], r1 - vld1.32 {d2[0]}, [r6@32], r3 - vld1.32 {d2[1]}, [r6@32], r3 - vrhadd.u8 d0, d0, d2 - vst1.32 {d0[0]}, [r2@32], r3 - vst1.32 {d0[1]}, [r2@32], r3 - subs r5, r5, #2 - bgt avg4 - pop {r4-r6, pc} - ENDP - - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c deleted file mode 100644 index 9e57c7176..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" - -void aom_convolve_copy_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - uint8x8_t d0u8, d2u8; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - if (w > 32) { // copy64 - for (; h > 0; h--) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // copy32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // copy16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // copy8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(src); - src += src_stride; - - vst1_u8(dst, d0u8); - dst += dst_stride; - vst1_u8(dst, d2u8); - dst += dst_stride; - } - } else { // copy4 - for (; h > 0; h--) { - *(uint32_t *)dst = *(const uint32_t *)src; - src += src_stride; - dst += dst_stride; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm deleted file mode 100644 index 443d7178a..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm +++ /dev/null @@ -1,87 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_convolve_copy_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|aom_convolve_copy_neon| PROC - push {r4-r5, lr} - ldrd r4, r5, [sp, #28] - - cmp r4, #32 - bgt copy64 - beq copy32 - cmp r4, #8 - bgt copy16 - beq copy8 - b copy4 - -copy64 - sub lr, r1, #32 - sub r3, r3, #32 -copy64_h - pld [r0, r1, lsl #1] - vld1.8 {q0-q1}, [r0]! - vld1.8 {q2-q3}, [r0], lr - vst1.8 {q0-q1}, [r2@128]! - vst1.8 {q2-q3}, [r2@128], r3 - subs r5, r5, #1 - bgt copy64_h - pop {r4-r5, pc} - -copy32 - pld [r0, r1, lsl #1] - vld1.8 {q0-q1}, [r0], r1 - pld [r0, r1, lsl #1] - vld1.8 {q2-q3}, [r0], r1 - vst1.8 {q0-q1}, [r2@128], r3 - vst1.8 {q2-q3}, [r2@128], r3 - subs r5, r5, #2 - bgt copy32 - pop {r4-r5, pc} - -copy16 - pld [r0, r1, lsl #1] - vld1.8 {q0}, [r0], r1 - pld [r0, r1, lsl #1] - vld1.8 {q1}, [r0], r1 - vst1.8 {q0}, [r2@128], r3 - vst1.8 {q1}, [r2@128], r3 - subs r5, r5, #2 - bgt copy16 - pop {r4-r5, pc} - -copy8 - pld [r0, r1, lsl #1] - vld1.8 {d0}, [r0], r1 - pld [r0, r1, lsl #1] - vld1.8 {d2}, [r0], r1 - vst1.8 {d0}, [r2@64], r3 - vst1.8 {d2}, [r2@64], r3 - subs r5, r5, #2 - bgt copy8 - pop {r4-r5, pc} - -copy4 - ldr r12, [r0], r1 - str r12, [r2], r3 - subs r5, r5, #1 - bgt copy4 - pop {r4-r5, pc} - ENDP - - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c deleted file mode 100644 index 6c2997e04..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the - * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). - */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - - // Account for the vertical phase needing 3 lines prior and 4 lines post - int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* Filter starting 3 lines back. The neon implementation will ignore the - * given height and filter a multiple of 4 lines. Since this goes in to - * the temp buffer which has lots of extra room and is subsequently discarded - * this is safe if somewhat less than ideal. - */ - aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, - intermediate_height); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, - intermediate_height); - aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c deleted file mode 100644 index 6ff760017..000000000 --- a/third_party/aom/aom_dsp/arm/avg_neon.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> -#include <assert.h> - -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" - -#include "aom/aom_integer.h" - -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -// coeff: 16 bits, dynamic range [-32640, 32640]. -// length: value range {16, 64, 256, 1024}. -int aom_satd_neon(const int16_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); - - do { - const int16x8_t src0 = vld1q_s16(coeff); - const int16x8_t src8 = vld1q_s16(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); - length -= 16; - coeff += 16; - } while (length != 0); - - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int satd = vget_lane_s32(s1, 0); - return satd; - } -} - -void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, int ref_stride, - int height) { - int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); - - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; - } - - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); - - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); -} - -int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) { - int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; - } - - return horizontal_add_u16x8(vec_sum); -} - -// ref, src = [0, 510] - max diff = 16-bits -// bwl = {2, 3, 4}, width = {16, 32, 64} -int aom_vector_var_neon(int16_t const *ref, int16_t const *src, int bwl) { - int width = 4 << bwl; - int32x4_t sse = vdupq_n_s32(0); - int16x8_t total = vdupq_n_s16(0); - - assert(width >= 8); - assert((width % 8) == 0); - - do { - const int16x8_t r = vld1q_s16(ref); - const int16x8_t s = vld1q_s16(src); - const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. - sse = vmlal_s16(sse, diff_hi, diff_hi); - total = vaddq_s16(total, diff); // dynamic range 16 bits. - - ref += 8; - src += 8; - width -= 8; - } while (width != 0); - - { - // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired - // with the summation of 'sse' performed better on a Cortex-A15. - const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' - const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); - const int32x2_t t2 = vpadd_s32(t1, t1); - const int t = vget_lane_s32(t2, 0); - const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int s = vget_lane_s32(s1, 0); - const int shift_factor = bwl + 2; - return s - ((t * t) >> shift_factor); - } -} - -void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int *min, int *max) { - // Load and concatenate. - const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)); - const uint8x16_t a23 = - vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride)); - const uint8x16_t a45 = - vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride)); - const uint8x16_t a67 = - vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride)); - - const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)); - const uint8x16_t b23 = - vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride)); - const uint8x16_t b45 = - vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride)); - const uint8x16_t b67 = - vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride)); - - // Absolute difference. - const uint8x16_t ab01_diff = vabdq_u8(a01, b01); - const uint8x16_t ab23_diff = vabdq_u8(a23, b23); - const uint8x16_t ab45_diff = vabdq_u8(a45, b45); - const uint8x16_t ab67_diff = vabdq_u8(a67, b67); - - // Max values between the Q vectors. - const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff); - const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff); - const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff); - const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff); - - const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); - const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - - // Split to D and start doing pairwise. - uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); - uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - - // Enough runs of vpmax/min propogate the max/min values to every position. - ab_max = vpmax_u8(ab_max, ab_max); - ab_min = vpmin_u8(ab_min, ab_min); - - ab_max = vpmax_u8(ab_max, ab_max); - ab_min = vpmin_u8(ab_min, ab_min); - - ab_max = vpmax_u8(ab_max, ab_max); - ab_min = vpmin_u8(ab_min, ab_min); - - *min = *max = 0; // Clear high bits - // Store directly to avoid costly neon->gpr transfer. - vst1_lane_u8((uint8_t *)max, ab_max, 0); - vst1_lane_u8((uint8_t *)min, ab_min, 0); -} diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c new file mode 100644 index 000000000..82c0b0e28 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1, + const int16x8_t v_maxval, int16x8_t *res) { + int32x4_t im_res_low, im_res_high; + const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask); + + im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0)); + im_res_low = + vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1)); + + im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0)); + im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask), + vget_high_s16(src_1)); + + *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS), + vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS)); +} + +static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride, + const CONV_BUF_TYPE *src0, uint32_t src0_stride, + const CONV_BUF_TYPE *src1, uint32_t src1_stride, + int16x8_t mask0, int16x8_t mask1, int16x8_t mask2, + int16x8_t mask3, const int16x8_t v_maxval, + const uint16x8_t vec_round_offset, + const int16x8_t vec_round_bits) { + int16x8_t src0_0, src0_1, src0_2, src0_3; + int16x8_t src1_0, src1_1, src1_2, src1_3; + int16x8_t im_res_0, im_res_1, im_res_2, im_res_3; + + load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2, + &src0_3); + load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2, + &src1_3); + + blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0); + blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1); + blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2); + blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3); + + uint16x8_t im_res1_0 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset); + uint16x8_t im_res1_1 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset); + uint16x8_t im_res1_2 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset); + uint16x8_t im_res1_3 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset); + + im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits); + im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits); + im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits); + im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits); + + vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0)); + vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1)); + vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2)); + vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3)); +} + +static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride, + const CONV_BUF_TYPE *src0, uint32_t src0_stride, + const CONV_BUF_TYPE *src1, uint32_t src1_stride, + int16x4_t mask0, int16x4_t mask1, int16x4_t mask2, + int16x4_t mask3, const int16x8_t v_maxval, + const uint16x8_t vec_round_offset, + const int16x8_t vec_round_bits) { + int16x8_t src0_0, src0_1; + int16x8_t src1_0, src1_1; + uint64x2_t tu0, tu1, tu2, tu3; + int16x8_t mask0_1, mask2_3; + int16x8_t res0, res1; + + load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1); + load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3); + + src0_0 = vreinterpretq_s16_u64(tu0); + src0_1 = vreinterpretq_s16_u64(tu1); + + src1_0 = vreinterpretq_s16_u64(tu2); + src1_1 = vreinterpretq_s16_u64(tu3); + + mask0_1 = vcombine_s16(mask0, mask1); + mask2_3 = vcombine_s16(mask2, mask3); + + blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0); + blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1); + + uint16x8_t im_res_0 = + vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset); + uint16x8_t im_res_1 = + vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset); + + src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits); + src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits); + + uint8x8_t res_0 = vqmovun_s16(src0_0); + uint8x8_t res_1 = vqmovun_s16(src0_1); + + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0), + 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0), + 1); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1), + 0); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1), + 1); +} + +void aom_lowbd_blend_a64_d16_mask_neon( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + int i = 0; + const int bd = 8; + int w_tmp = w; + const uint8_t *mask_tmp = mask; + const CONV_BUF_TYPE *src0_tmp = src0; + const CONV_BUF_TYPE *src1_tmp = src1; + uint8_t *dst_tmp = dst; + + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + uint8x8_t s0, s1, s2, s3; + uint32x2_t tu0, tu1, tu2, tu3; + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + int16x8_t mask0, mask1, mask2, mask3; + int16x8_t mask4, mask5, mask6, mask7; + int32x4_t m0_32, m1_32, m2_32, m3_32; + int32x4_t m4_32, m5_32, m6_32, m7_32; + uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l; + uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l; + int16x4_t mask0_low, mask1_low, mask2_low, mask3_low; + const uint16x4_t vec_zero = vdup_n_u16(0); + const uint16_t offset = round_offset - (1 << (round_bits - 1)); + const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA); + const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); + const uint16x8_t vec_offset = vdupq_n_u16(offset); + + if (subw == 0 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3); + + mask0 = vmovl_s8(vreinterpret_s8_u8(s0)); + mask1 = vmovl_s8(vreinterpret_s8_u8(s1)); + mask2 = vmovl_s8(vreinterpret_s8_u8(s2)); + mask3 = vmovl_s8(vreinterpret_s8_u8(s3)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1); + + mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1))); + + mask0_low = vget_low_s16(mask0); + mask1_low = vget_high_s16(mask0); + mask2_low = vget_low_s16(mask1); + mask3_low = vget_high_s16(mask1); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else if (subw == 1 && subh == 1) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + + mask0 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1))); + mask1 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3))); + mask2 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5))); + mask3 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7))); + + mask4 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t0), vget_high_u8(t1))); + mask5 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t2), vget_high_u8(t3))); + mask6 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t4), vget_high_u8(t5))); + mask7 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t6), vget_high_u8(t7))); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + m4_32 = vpaddlq_s16(mask4); + m5_32 = vpaddlq_s16(mask5); + m6_32 = vpaddlq_s16(mask6); + m7_32 = vpaddlq_s16(mask7); + + mask0 = + vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2)); + mask1 = + vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2)); + mask2 = + vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2)); + mask3 = + vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + mask0_low = vqrshrn_n_s32(m0_32, 2); + mask1_low = vqrshrn_n_s32(m1_32, 2); + mask2_low = vqrshrn_n_s32(m2_32, 2); + mask3_low = vqrshrn_n_s32(m3_32, 2); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (8 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else if (subw == 1 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3); + + mask0 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0)))); + mask1 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1)))); + mask2 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2)))); + mask3 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3)))); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l); + + mask0 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero)); + mask1 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero)); + mask2 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero)); + mask3 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero)); + + mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1))); + mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1))); + mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1))); + mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1))); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1); + load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2, + &tu3); + + s0 = vreinterpret_u8_u32(tu0); + s1 = vreinterpret_u8_u32(tu1); + s2 = vreinterpret_u8_u32(tu2); + s3 = vreinterpret_u8_u32(tu3); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3)); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + + mask0_low = vget_low_s16(mask0); + mask1_low = vget_high_s16(mask0); + mask2_low = vget_low_s16(mask1); + mask3_low = vget_high_s16(mask1); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (8 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } +} diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c index 1cf8a3a6e..e4300c992 100644 --- a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c +++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c @@ -11,7 +11,8 @@ #include <arm_neon.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_dsp/txfm_common.h" void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c deleted file mode 100644 index 9baefae47..000000000 --- a/third_party/aom/aom_dsp/arm/hadamard_neon.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_dsp_rtcd.h" - -static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, - int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { - const int16x8_t b0 = vaddq_s16(*a0, *a1); - const int16x8_t b1 = vsubq_s16(*a0, *a1); - const int16x8_t b2 = vaddq_s16(*a2, *a3); - const int16x8_t b3 = vsubq_s16(*a2, *a3); - const int16x8_t b4 = vaddq_s16(*a4, *a5); - const int16x8_t b5 = vsubq_s16(*a4, *a5); - const int16x8_t b6 = vaddq_s16(*a6, *a7); - const int16x8_t b7 = vsubq_s16(*a6, *a7); - - const int16x8_t c0 = vaddq_s16(b0, b2); - const int16x8_t c1 = vaddq_s16(b1, b3); - const int16x8_t c2 = vsubq_s16(b0, b2); - const int16x8_t c3 = vsubq_s16(b1, b3); - const int16x8_t c4 = vaddq_s16(b4, b6); - const int16x8_t c5 = vaddq_s16(b5, b7); - const int16x8_t c6 = vsubq_s16(b4, b6); - const int16x8_t c7 = vsubq_s16(b5, b7); - - *a0 = vaddq_s16(c0, c4); - *a1 = vsubq_s16(c2, c6); - *a2 = vsubq_s16(c0, c4); - *a3 = vaddq_s16(c2, c6); - *a4 = vaddq_s16(c3, c7); - *a5 = vsubq_s16(c3, c7); - *a6 = vsubq_s16(c1, c5); - *a7 = vaddq_s16(c1, c5); -} - -// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider -// reversing transpose order which may make it easier for the compiler to -// reconcile the vtrn.64 moves. -static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, - int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { - // Swap 64 bit elements. Goes from: - // a0: 00 01 02 03 04 05 06 07 - // a1: 08 09 10 11 12 13 14 15 - // a2: 16 17 18 19 20 21 22 23 - // a3: 24 25 26 27 28 29 30 31 - // a4: 32 33 34 35 36 37 38 39 - // a5: 40 41 42 43 44 45 46 47 - // a6: 48 49 50 51 52 53 54 55 - // a7: 56 57 58 59 60 61 62 63 - // to: - // a04_lo: 00 01 02 03 32 33 34 35 - // a15_lo: 08 09 10 11 40 41 42 43 - // a26_lo: 16 17 18 19 48 49 50 51 - // a37_lo: 24 25 26 27 56 57 58 59 - // a04_hi: 04 05 06 07 36 37 38 39 - // a15_hi: 12 13 14 15 44 45 46 47 - // a26_hi: 20 21 22 23 52 53 54 55 - // a37_hi: 28 29 30 31 60 61 62 63 - const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4)); - const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5)); - const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6)); - const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7)); - const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4)); - const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5)); - const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6)); - const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7)); - - // Swap 32 bit elements resulting in: - // a0246_lo: - // 00 01 16 17 32 33 48 49 - // 02 03 18 19 34 35 50 51 - // a1357_lo: - // 08 09 24 25 40 41 56 57 - // 10 11 26 27 42 43 58 59 - // a0246_hi: - // 04 05 20 21 36 37 52 53 - // 06 07 22 23 38 39 54 55 - // a1657_hi: - // 12 13 28 29 44 45 60 61 - // 14 15 30 31 46 47 62 63 - const int32x4x2_t a0246_lo = - vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); - const int32x4x2_t a1357_lo = - vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); - const int32x4x2_t a0246_hi = - vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); - const int32x4x2_t a1357_hi = - vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); - - // Swap 16 bit elements resulting in: - // b0: - // 00 08 16 24 32 40 48 56 - // 01 09 17 25 33 41 49 57 - // b1: - // 02 10 18 26 34 42 50 58 - // 03 11 19 27 35 43 51 59 - // b2: - // 04 12 20 28 36 44 52 60 - // 05 13 21 29 37 45 53 61 - // b3: - // 06 14 22 30 38 46 54 62 - // 07 15 23 31 39 47 55 63 - const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]), - vreinterpretq_s16_s32(a1357_lo.val[0])); - const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]), - vreinterpretq_s16_s32(a1357_lo.val[1])); - const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]), - vreinterpretq_s16_s32(a1357_hi.val[0])); - const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]), - vreinterpretq_s16_s32(a1357_hi.val[1])); - - *a0 = b0.val[0]; - *a1 = b0.val[1]; - *a2 = b1.val[0]; - *a3 = b1.val[1]; - *a4 = b2.val[0]; - *a5 = b2.val[1]; - *a6 = b3.val[0]; - *a7 = b3.val[1]; -} - -void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int16x8_t a0 = vld1q_s16(src_diff); - int16x8_t a1 = vld1q_s16(src_diff + src_stride); - int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); - int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); - int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); - int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); - int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); - int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); - - hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - - transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - - hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - - // Skip the second transpose because it is not required. - - vst1q_s16(coeff + 0, a0); - vst1q_s16(coeff + 8, a1); - vst1q_s16(coeff + 16, a2); - vst1q_s16(coeff + 24, a3); - vst1q_s16(coeff + 32, a4); - vst1q_s16(coeff + 40, a5); - vst1q_s16(coeff + 48, a6); - vst1q_s16(coeff + 56, a7); -} - -void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int i; - - /* Rearrange 16x16 to 8x32 and remove stride. - * Top left first. */ - aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); - /* Top right. */ - aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); - /* Bottom left. */ - aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); - /* Bottom right. */ - aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); - - for (i = 0; i < 64; i += 8) { - const int16x8_t a0 = vld1q_s16(coeff + 0); - const int16x8_t a1 = vld1q_s16(coeff + 64); - const int16x8_t a2 = vld1q_s16(coeff + 128); - const int16x8_t a3 = vld1q_s16(coeff + 192); - - const int16x8_t b0 = vhaddq_s16(a0, a1); - const int16x8_t b1 = vhsubq_s16(a0, a1); - const int16x8_t b2 = vhaddq_s16(a2, a3); - const int16x8_t b3 = vhsubq_s16(a2, a3); - - const int16x8_t c0 = vaddq_s16(b0, b2); - const int16x8_t c1 = vaddq_s16(b1, b3); - const int16x8_t c2 = vsubq_s16(b0, b2); - const int16x8_t c3 = vsubq_s16(b1, b3); - - vst1q_s16(coeff + 0, c0); - vst1q_s16(coeff + 64, c1); - vst1q_s16(coeff + 128, c2); - vst1q_s16(coeff + 192, c3); - - coeff += 8; - } -} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c deleted file mode 100644 index 196b2a890..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm deleted file mode 100644 index d01c4bc03..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm +++ /dev/null @@ -1,201 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |aom_idct16x16_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c deleted file mode 100644 index b4cb7a0cd..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c +++ /dev/null @@ -1,1295 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16((int16_t)cospi_24_64); - d31s16 = vdup_n_s16((int16_t)cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d12s16 = vdup_n_s16((int16_t)cospi_30_64); - d13s16 = vdup_n_s16((int16_t)cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16((int16_t)cospi_14_64); - d31s16 = vdup_n_s16((int16_t)cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16((int16_t)cospi_22_64); - d31s16 = vdup_n_s16((int16_t)cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16((int16_t)cospi_6_64); - d31s16 = vdup_n_s16((int16_t)cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = - vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = - vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); - } - return; -} - -void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // stage 3 - q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2)); - q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2)); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - // stage 4 - q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2)); - d4s16 = vdup_n_s16((int16_t)cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2)); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2)); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16((int16_t)(cospi_6_64 * 2)); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm deleted file mode 100644 index 4a8f8f183..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm +++ /dev/null @@ -1,1182 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct16x16_256_add_neon_pass1| - EXPORT |aom_idct16x16_256_add_neon_pass2| - EXPORT |aom_idct16x16_10_add_neon_pass1| - EXPORT |aom_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64 = 3196 - mov r3, #0xc00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r12, #0x3e00 - add r12, #0xc5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; generate cospi_12_64 = 13623 - mov r3, #0x3500 - add r3, #0x37 - - ; generate cospi_20_64 = 9102 - mov r12, #0x2300 - add r12, #0x8e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; generate cospi_24_64 = 6270 - mov r12, #0x1800 - add r12, #0x7e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; generate cospi_8_64 = 15137 - mov r3, #0x3b00 - add r3, #0x21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, #14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |aom_idct16x16_256_add_neon_pass1| - -;void aom_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate cospi_30_64 = 1606 - mov r3, #0x0600 - add r3, #0x46 - - ; generate cospi_2_64 = 16305 - mov r12, #0x3f00 - add r12, #0xb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; generate cospi_14_64 = 12665 - mov r3, #0x3100 - add r3, #0x79 - - ; generate cospi_18_64 = 10394 - mov r12, #0x2800 - add r12, #0x9a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b - - ; generate cospi_10_64 = 14449 - mov r12, #0x3800 - add r12, #0x71 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; generate cospi_6_64 = 15679 - mov r3, #0x3d00 - add r3, #0x3f - - ; generate cospi_26_64 = 4756 - mov r12, #0x1200 - add r12, #0x94 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |aom_idct16x16_256_add_neon_pass2| - -;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64*2 = 6392 - mov r3, #0x1800 - add r3, #0xf8 - - ; generate cospi_4_64*2 = 32138 - mov r12, #0x7d00 - add r12, #0x8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. - ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; generate cospi_16_64*2 = 23170 - mov r3, #0x5a00 - add r3, #0x82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |aom_idct16x16_10_add_neon_pass1| - -;void aom_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate 2*cospi_30_64 = 3212 - mov r3, #0xc00 - add r3, #0x8c - - ; generate 2*cospi_2_64 = 32610 - mov r12, #0x7f00 - add r12, #0x62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; generate 2*cospi_26_64 = 9512 - mov r12, #0x2500 - add r12, #0x28 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; generate 2*cospi_6_64 = 31358 - mov r3, #0x7a00 - add r3, #0x7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |aom_idct16x16_10_add_neon_pass2| - END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_neon.c deleted file mode 100644 index db0d4905b..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_neon.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/aom_dsp_common.h" - -void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); -void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void aom_push_neon(int64_t *store); -extern void aom_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - aom_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, - pass1_output, 0, dest, dest_stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - aom_pop_neon(store_reg); -#endif - - return; -} - -void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - aom_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - aom_pop_neon(store_reg); -#endif - - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c deleted file mode 100644 index 547567c5b..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_config.h" - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; -} - -static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; -} - -void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } - } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm deleted file mode 100644 index b04df2d0b..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm +++ /dev/null @@ -1,147 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - EXPORT |aom_idct32x32_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ;TODO(hkuang): put the following macros in a seperate - ;file so other idct function could also use them. - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10},[$dst], $stride - vst1.8 {q11},[$dst], $stride - vst1.8 {q12},[$dst], $stride - vst1.8 {q13},[$dst], $stride - vst1.8 {q14},[$dst], $stride - vst1.8 {q15},[$dst], $stride - MEND - -;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride - -|aom_idct32x32_1_add_neon| PROC - push {lr} - pld [r1] - add r3, r1, #16 ; r3 dest + 16 for second loop - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asrs r0, r0, #6 ; >> 6 - bge diff_positive_32_32 - -diff_negative_32_32 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_negative_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_negative_32_32_loop - pop {pc} - -diff_positive_32_32 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_positive_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_positive_32_32_loop - pop {pc} - - ENDP ; |aom_idct32x32_1_add_neon| - END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c deleted file mode 100644 index a7562c7d5..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c +++ /dev/null @@ -1,686 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" - -#define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); - -#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); - -#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); - -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; - - d8s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); - - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); - - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - - vst1_s16((int16_t *)p1, d9s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); - p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; -} - -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ - ; \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; - - d4s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); - - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); - - q5s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); - - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); - - vst1_s16((int16_t *)p1, d5s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); - p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; -} - -#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, - int16_t first_const, int16_t second_const, - int16x8_t *qAs16, int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; - - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); - - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); - - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); - - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); - - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); - return; -} - -static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { - int16_t *in; - int i; - const int stride = 32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - q12s16 = vcombine_s16(d17s16, d25s16); - q13s16 = vcombine_s16(d19s16, d27s16); - q14s16 = vcombine_s16(d21s16, d29s16); - q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - vst1q_s16(t_buf, q0x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q0x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[1]); - t_buf += 8; - } - return; -} - -static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, - int16x8_t q3s16, int16x8_t q6s16, - int16x8_t q7s16, int16x8_t q8s16, - int16x8_t q9s16, int16x8_t q10s16, - int16x8_t q11s16, int16x8_t q12s16, - int16x8_t q13s16, int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; -} - -static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, - int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, - int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, - int16x8_t q14s16, int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest /* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; - - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; - - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; - - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; - - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; - - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; - - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; -} - -void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { - int i, idct32_pass_loop; - int16_t trans_buf[32 * 8]; - int16_t pass1[32 * 32]; - int16_t pass2[32 * 32]; - int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - - for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); - - // ----------------------------------------- - // BLOCK A: 16-19,28-31 - // ----------------------------------------- - // generate 16,17,30,31 - // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) - // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) - - // generate 18,19,28,29 - // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) - // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) - // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) - - // ----------------------------------------- - // BLOCK B: 20-23,24-27 - // ----------------------------------------- - // generate 20,21,26,27 - // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - - // generate 22,23,24,25 - // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) - // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) - // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); - // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) - // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); - // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) - - // ----------------------------------------- - // BLOCK C: 8-10,11-15 - // ----------------------------------------- - // generate 8,9,14,15 - // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) - // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) - - // generate 10,11,12,13 - // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) - // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) - // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) - // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) - - // ----------------------------------------- - // BLOCK D: 0-3,4-7 - // ----------------------------------------- - // generate 4,5,6,7 - // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - - // generate 0,1,2,3 - // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) - // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); - // part of stage 6 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); - // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - - if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, - q15s16); - } else { - idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16, - q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, - q14s16, q15s16); - dest += 8; - } - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm deleted file mode 100644 index e7793fb16..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm +++ /dev/null @@ -1,1302 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -;TODO(cd): adjust these constant to be able to use vqdmulh for faster -; dct_const_round_shift(a * b) within butterfly calculations. -cospi_1_64 EQU 16364 -cospi_2_64 EQU 16305 -cospi_3_64 EQU 16207 -cospi_4_64 EQU 16069 -cospi_5_64 EQU 15893 -cospi_6_64 EQU 15679 -cospi_7_64 EQU 15426 -cospi_8_64 EQU 15137 -cospi_9_64 EQU 14811 -cospi_10_64 EQU 14449 -cospi_11_64 EQU 14053 -cospi_12_64 EQU 13623 -cospi_13_64 EQU 13160 -cospi_14_64 EQU 12665 -cospi_15_64 EQU 12140 -cospi_16_64 EQU 11585 -cospi_17_64 EQU 11003 -cospi_18_64 EQU 10394 -cospi_19_64 EQU 9760 -cospi_20_64 EQU 9102 -cospi_21_64 EQU 8423 -cospi_22_64 EQU 7723 -cospi_23_64 EQU 7005 -cospi_24_64 EQU 6270 -cospi_25_64 EQU 5520 -cospi_26_64 EQU 4756 -cospi_27_64 EQU 3981 -cospi_28_64 EQU 3196 -cospi_29_64 EQU 2404 -cospi_30_64 EQU 1606 -cospi_31_64 EQU 804 - - - EXPORT |aom_idct32x32_1024_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY - - ; -------------------------------------------------------------------------- - ; Load from transposed_buffer - ; q13 = transposed_buffer[first_offset] - ; q14 = transposed_buffer[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; transposed_buffer must be passed in. use 0 for first use. - MACRO - LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset - ; address calculation with proper stride and loading - add r0, #($first_offset - $prev_offset )*8*2 - vld1.s16 {q14}, [r0] - add r0, #($second_offset - $first_offset)*8*2 - vld1.s16 {q13}, [r0] - ; (used) two registers (q14, q13) - MEND - ; -------------------------------------------------------------------------- - ; Load from output (used as temporary storage) - ; reg1 = output[first_offset] - ; reg2 = output[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and loading - add r1, #($first_offset - $prev_offset )*32*2 - vld1.s16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vld1.s16 {$reg2}, [r1] - ; (used) two registers ($reg1, $reg2) - MEND - ; -------------------------------------------------------------------------- - ; Store into output (sometimes as as temporary storage) - ; output[first_offset] = reg1 - ; output[second_offset] = reg2 - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and storing - add r1, #($first_offset - $prev_offset )*32*2 - vst1.16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vst1.16 {$reg2}, [r1] - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10] - vst1.16 {d11}, [r9] - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10]! - vst1.16 {d11}, [r9]! - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6] - vst1.16 {d4}, [r7] - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6]! - vst1.16 {d4}, [r7]! - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - ; TODO(cd): have special case to re-use constants when they are similar for - ; consecutive butterflies - ; TODO(cd): have special case when both constants are the same, do the - ; additions/subtractions before the multiplies. - ; generate the constants - ; generate scalar constants - mov r8, #$first_constant & 0xFF00 - mov r12, #$second_constant & 0xFF00 - add r8, #$first_constant & 0x00FF - add r12, #$second_constant & 0x00FF - ; generate vector constants - vdup.16 d30, r8 - vdup.16 d31, r12 - ; (used) two for inputs (regA-regD), one for constants (q15) - ; do some multiplications (ordered for maximum latency hiding) - vmull.s16 q8, $regC, d30 - vmull.s16 q10, $regA, d31 - vmull.s16 q9, $regD, d30 - vmull.s16 q11, $regB, d31 - vmull.s16 q12, $regC, d31 - ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/subtractions (to get back two register) - vsub.s32 q8, q8, q10 - vsub.s32 q9, q9, q11 - ; do more multiplications (ordered for maximum latency hiding) - vmull.s16 q10, $regD, d31 - vmull.s16 q11, $regA, d30 - vmull.s16 q15, $regB, d30 - ; (used) six for intermediate (q8-q12, q15) - ; do more addition/subtractions - vadd.s32 q11, q12, q11 - vadd.s32 q10, q10, q15 - ; (used) four for intermediate (q8-q11) - ; dct_const_round_shift - vqrshrn.s32 $reg1, q8, #14 - vqrshrn.s32 $reg2, q9, #14 - vqrshrn.s32 $reg3, q11, #14 - vqrshrn.s32 $reg4, q10, #14 - ; (used) two for results, well four d registers - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - MEND - ; -------------------------------------------------------------------------- - -;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); -; -; r0 int16_t *input, -; r1 uint8_t *dest, -; r2 int dest_stride) -; loop counters -; r4 bands loop counter -; r5 pass loop counter -; r8 transpose loop counter -; combine-add pointers -; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) -; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) -; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) -; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) - -|aom_idct32x32_1024_add_neon| PROC - ; This function does one pass of idct32x32 transform. - ; - ; This is done by transposing the input and then doing a 1d transform on - ; columns. In the first pass, the transposed columns are the original - ; rows. In the second pass, after the transposition, the colums are the - ; original columns. - ; The 1d transform is done by looping over bands of eight columns (the - ; idct32_bands loop). For each band, the transform input transposition - ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are transposed by pairs (the idct32_transpose_pair loop). - push {r4-r11} - vpush {d8-d15} - ; stack operation - ; internal buffer used to transpose 8 lines into before transforming them - ; int16_t transpose_buffer[32 * 8]; - ; at sp + [4096, 4607] - ; results of the first pass (transpose and transform rows) - ; int16_t pass1[32 * 32]; - ; at sp + [0, 2047] - ; results of the second pass (transpose and transform columns) - ; int16_t pass2[32 * 32]; - ; at sp + [2048, 4095] - sub sp, sp, #512+2048+2048 - - ; r6 = dest + 31 * dest_stride - ; r7 = dest + 0 * dest_stride - ; r9 = dest + 15 * dest_stride - ; r10 = dest + 16 * dest_stride - rsb r6, r2, r2, lsl #5 - rsb r9, r2, r2, lsl #4 - add r10, r1, r2, lsl #4 - mov r7, r1 - add r6, r6, r1 - add r9, r9, r1 - ; r11 = -dest_stride - neg r11, r2 - ; r3 = input - mov r3, r0 - ; parameters for first pass - ; r0 = transpose_buffer[32 * 8] - add r0, sp, #4096 - ; r1 = pass1[32 * 32] - mov r1, sp - - mov r5, #0 ; initialize pass loop counter -idct32_pass_loop - mov r4, #4 ; initialize bands loop counter -idct32_bands_loop - mov r8, #2 ; initialize transpose loop counter -idct32_transpose_pair_loop - ; Load two horizontally consecutive 8x8 16bit data matrices. The first one - ; into q0-q7 and the second one into q8-q15. There is a stride of 64, - ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r3]! - vld1.s16 {q0}, [r3]! - add r3, #32 - vld1.s16 {q9}, [r3]! - vld1.s16 {q1}, [r3]! - add r3, #32 - vld1.s16 {q10}, [r3]! - vld1.s16 {q2}, [r3]! - add r3, #32 - vld1.s16 {q11}, [r3]! - vld1.s16 {q3}, [r3]! - add r3, #32 - vld1.s16 {q12}, [r3]! - vld1.s16 {q4}, [r3]! - add r3, #32 - vld1.s16 {q13}, [r3]! - vld1.s16 {q5}, [r3]! - add r3, #32 - vld1.s16 {q14}, [r3]! - vld1.s16 {q6}, [r3]! - add r3, #32 - vld1.s16 {q15}, [r3]! - vld1.s16 {q7}, [r3]! - - ; Transpose the two 8x8 16bit data matrices. - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vswp d1, d8 - vswp d7, d14 - vswp d5, d12 - vswp d3, d10 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; Store both matrices after each other. There is a stride of 32, which - ; adjusts to nothing because of the post-increments. - vst1.16 {q8}, [r0]! - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! - vst1.16 {q0}, [r0]! - vst1.16 {q1}, [r0]! - vst1.16 {q2}, [r0]! - vst1.16 {q3}, [r0]! - vst1.16 {q4}, [r0]! - vst1.16 {q5}, [r0]! - vst1.16 {q6}, [r0]! - vst1.16 {q7}, [r0]! - - ; increment pointers by adjusted stride (not necessary for r0/out) - ; go back by 7*32 for the seven lines moved fully by read and add - ; go back by 32 for the eigth line only read - ; advance by 16*2 to go the next pair - sub r3, r3, #7*32*2 + 32 - 16*2 - ; transpose pair loop processing - subs r8, r8, #1 - bne idct32_transpose_pair_loop - - ; restore r0/input to its original value - sub r0, r0, #32*8*2 - - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results. To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-10,11-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - - ; -------------------------------------------------------------------------- - ; BLOCK A: 16-19,28-31 - ; -------------------------------------------------------------------------- - ; generate 16,17,30,31 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; - ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; - ;step1b[16][i] = dct_const_round_shift(temp1); - ;step1b[31][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 0, 1, 31 - DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; - ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; - ;step1b[17][i] = dct_const_round_shift(temp1); - ;step1b[30][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 31, 17, 15 - DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[16] = step1b[16][i] + step1b[17][i]; - ;step2[17] = step1b[16][i] - step1b[17][i]; - ;step2[30] = -step1b[30][i] + step1b[31][i]; - ;step2[31] = step1b[30][i] + step1b[31][i]; - vadd.s16 q4, q0, q1 - vsub.s16 q13, q0, q1 - vadd.s16 q6, q2, q3 - vsub.s16 q14, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; - ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; - ;step3[17] = dct_const_round_shift(temp1); - ;step3[30] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; generate 18,19,28,29 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; - ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; - ;step1b[18][i] = dct_const_round_shift(temp1); - ;step1b[29][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 15, 9, 23 - DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; - ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; - ;step1b[19][i] = dct_const_round_shift(temp1); - ;step1b[28][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 23, 25, 7 - DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[18] = -step1b[18][i] + step1b[19][i]; - ;step2[19] = step1b[18][i] + step1b[19][i]; - ;step2[28] = step1b[28][i] + step1b[29][i]; - ;step2[29] = step1b[28][i] - step1b[29][i]; - vsub.s16 q13, q3, q2 - vadd.s16 q3, q3, q2 - vsub.s16 q14, q1, q0 - vadd.s16 q2, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); - ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); - ;step3[29] = dct_const_round_shift(temp1); - ;step3[18] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 - ; -------------------------------------------------------------------------- - ; combine 16-19,28-31 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[16] = step1b[16][i] + step1b[19][i]; - ;step1[17] = step1b[17][i] + step1b[18][i]; - ;step1[18] = step1b[17][i] - step1b[18][i]; - ;step1[29] = step1b[30][i] - step1b[29][i]; - ;step1[30] = step1b[30][i] + step1b[29][i]; - ;step1[31] = step1b[31][i] + step1b[28][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q0 - vadd.s16 q10, q7, q1 - vadd.s16 q15, q6, q3 - vsub.s16 q13, q5, q0 - vsub.s16 q14, q7, q1 - STORE_IN_OUTPUT 0, 16, 31, q8, q15 - STORE_IN_OUTPUT 31, 17, 30, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; - ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; - ;step2[18] = dct_const_round_shift(temp1); - ;step2[29] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 - STORE_IN_OUTPUT 30, 29, 18, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[19] = step1b[16][i] - step1b[19][i]; - ;step1[28] = step1b[31][i] - step1b[28][i]; - vsub.s16 q13, q4, q2 - vsub.s16 q14, q6, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; - ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; - ;step2[19] = dct_const_round_shift(temp1); - ;step2[28] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 - STORE_IN_OUTPUT 18, 19, 28, q4, q6 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK B: 20-23,24-27 - ; -------------------------------------------------------------------------- - ; generate 20,21,26,27 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; - ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; - ;step1b[20][i] = dct_const_round_shift(temp1); - ;step1b[27][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 7, 5, 27 - DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; - ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; - ;step1b[21][i] = dct_const_round_shift(temp1); - ;step1b[26][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 27, 21, 11 - DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[20] = step1b[20][i] + step1b[21][i]; - ;step2[21] = step1b[20][i] - step1b[21][i]; - ;step2[26] = -step1b[26][i] + step1b[27][i]; - ;step2[27] = step1b[26][i] + step1b[27][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; - ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; - ;step3[21] = dct_const_round_shift(temp1); - ;step3[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 22,23,24,25 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; - ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; - ;step1b[22][i] = dct_const_round_shift(temp1); - ;step1b[25][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 11, 13, 19 - DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; - ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; - ;step1b[23][i] = dct_const_round_shift(temp1); - ;step1b[24][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 19, 29, 3 - DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[22] = -step1b[22][i] + step1b[23][i]; - ;step2[23] = step1b[22][i] + step1b[23][i]; - ;step2[24] = step1b[24][i] + step1b[25][i]; - ;step2[25] = step1b[24][i] - step1b[25][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); - ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); - ;step3[25] = dct_const_round_shift(temp1); - ;step3[22] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 20-23,24-27 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[22] = step1b[22][i] + step1b[21][i]; - ;step1[23] = step1b[23][i] + step1b[20][i]; - vadd.s16 q10, q7, q1 - vadd.s16 q11, q5, q0 - ;step1[24] = step1b[24][i] + step1b[27][i]; - ;step1[25] = step1b[25][i] + step1b[26][i]; - vadd.s16 q12, q6, q2 - vadd.s16 q15, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[16] = step1b[16][i] + step1b[23][i]; - ;step3[17] = step1b[17][i] + step1b[22][i]; - ;step3[22] = step1b[17][i] - step1b[22][i]; - ;step3[23] = step1b[16][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 - vadd.s16 q8, q14, q11 - vadd.s16 q9, q13, q10 - vsub.s16 q13, q13, q10 - vsub.s16 q11, q14, q11 - STORE_IN_OUTPUT 17, 17, 16, q9, q8 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[24] = step1b[31][i] - step1b[24][i]; - ;step3[25] = step1b[30][i] - step1b[25][i]; - ;step3[30] = step1b[30][i] + step1b[25][i]; - ;step3[31] = step1b[31][i] + step1b[24][i]; - LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 - vsub.s16 q8, q9, q12 - vadd.s16 q10, q14, q15 - vsub.s16 q14, q14, q15 - vadd.s16 q12, q9, q12 - STORE_IN_OUTPUT 31, 30, 31, q10, q12 - ; -------------------------------------------------------------------------- - ; TODO(cd) do some register allocation change to remove these push/pop - vpush {q8} ; [24] - vpush {q11} ; [23] - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; - ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; - ;step1[22] = dct_const_round_shift(temp1); - ;step1[25] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 31, 25, 22, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; - ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; - ;step1[23] = dct_const_round_shift(temp1); - ;step1[24] = dct_const_round_shift(temp2); - ; TODO(cd) do some register allocation change to remove these push/pop - vpop {q13} ; [23] - vpop {q14} ; [24] - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 22, 24, 23, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[20] = step1b[23][i] - step1b[20][i]; - ;step1[27] = step1b[24][i] - step1b[27][i]; - vsub.s16 q14, q5, q0 - vsub.s16 q13, q6, q2 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); - ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); - ;step2[27] = dct_const_round_shift(temp1); - ;step2[20] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[21] = step1b[22][i] - step1b[21][i]; - ;step1[26] = step1b[25][i] - step1b[26][i]; - vsub.s16 q14, q7, q1 - vsub.s16 q13, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); - ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); - ;step2[26] = dct_const_round_shift(temp1); - ;step2[21] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[18] = step1b[18][i] + step1b[21][i]; - ;step3[19] = step1b[19][i] + step1b[20][i]; - ;step3[20] = step1b[19][i] - step1b[20][i]; - ;step3[21] = step1b[18][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 - vadd.s16 q8, q14, q1 - vadd.s16 q9, q13, q6 - vsub.s16 q13, q13, q6 - vsub.s16 q1, q14, q1 - STORE_IN_OUTPUT 19, 18, 19, q8, q9 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[27] = step1b[28][i] - step1b[27][i]; - ;step3[28] = step1b[28][i] + step1b[27][i]; - ;step3[29] = step1b[29][i] + step1b[26][i]; - ;step3[26] = step1b[29][i] - step1b[26][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 - vsub.s16 q14, q8, q5 - vadd.s16 q10, q8, q5 - vadd.s16 q11, q9, q0 - vsub.s16 q0, q9, q0 - STORE_IN_OUTPUT 29, 28, 29, q10, q11 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; - ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; - ;step1[20] = dct_const_round_shift(temp1); - ;step1[27] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 29, 20, 27, q13, q14 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; - ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; - ;step1[21] = dct_const_round_shift(temp1); - ;step1[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 - STORE_IN_OUTPUT 27, 21, 26, q1, q0 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK C: 8-10,11-15 - ; -------------------------------------------------------------------------- - ; generate 8,9,14,15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; - ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; - ;step2[8] = dct_const_round_shift(temp1); - ;step2[15] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 3, 2, 30 - DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; - ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; - ;step2[9] = dct_const_round_shift(temp1); - ;step2[14] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 30, 18, 14 - DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[8] = step1b[8][i] + step1b[9][i]; - ;step3[9] = step1b[8][i] - step1b[9][i]; - ;step3[14] = step1b[15][i] - step1b[14][i]; - ;step3[15] = step1b[15][i] + step1b[14][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; - ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; - ;step1[9] = dct_const_round_shift(temp1); - ;step1[14] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 10,11,12,13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; - ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; - ;step2[10] = dct_const_round_shift(temp1); - ;step2[13] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 14, 10, 22 - DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; - ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; - ;step2[11] = dct_const_round_shift(temp1); - ;step2[12] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 22, 26, 6 - DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[10] = step1b[11][i] - step1b[10][i]; - ;step3[11] = step1b[11][i] + step1b[10][i]; - ;step3[12] = step1b[12][i] + step1b[13][i]; - ;step3[13] = step1b[12][i] - step1b[13][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); - ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); - ;step1[13] = dct_const_round_shift(temp1); - ;step1[10] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 8-10,11-15 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[8] = step1b[8][i] + step1b[11][i]; - ;step2[9] = step1b[9][i] + step1b[10][i]; - ;step2[10] = step1b[9][i] - step1b[10][i]; - vadd.s16 q8, q0, q5 - vadd.s16 q9, q1, q7 - vsub.s16 q13, q1, q7 - ;step2[13] = step1b[14][i] - step1b[13][i]; - ;step2[14] = step1b[14][i] + step1b[13][i]; - ;step2[15] = step1b[15][i] + step1b[12][i]; - vsub.s16 q14, q3, q4 - vadd.s16 q10, q3, q4 - vadd.s16 q15, q2, q6 - STORE_IN_OUTPUT 26, 8, 15, q8, q15 - STORE_IN_OUTPUT 15, 9, 14, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; - ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; - ;step3[10] = dct_const_round_shift(temp1); - ;step3[13] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 14, 13, 10, q3, q1 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[11] = step1b[8][i] - step1b[11][i]; - ;step2[12] = step1b[15][i] - step1b[12][i]; - vsub.s16 q13, q0, q5 - vsub.s16 q14, q2, q6 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; - ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; - ;step3[11] = dct_const_round_shift(temp1); - ;step3[12] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 10, 11, 12, q1, q3 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK D: 0-3,4-7 - ; -------------------------------------------------------------------------- - ; generate 4,5,6,7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; - ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; - ;step3[4] = dct_const_round_shift(temp1); - ;step3[7] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 6, 4, 28 - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; - ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; - ;step3[5] = dct_const_round_shift(temp1); - ;step3[6] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 28, 20, 12 - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[4] = step1b[4][i] + step1b[5][i]; - ;step1[5] = step1b[4][i] - step1b[5][i]; - ;step1[6] = step1b[7][i] - step1b[6][i]; - ;step1[7] = step1b[7][i] + step1b[6][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; - ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; - ;step2[5] = dct_const_round_shift(temp1); - ;step2[6] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 0,1,2,3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; - ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; - ;step1[1] = dct_const_round_shift(temp1); - ;step1[0] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 12, 0, 16 - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; - ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; - ;step1[2] = dct_const_round_shift(temp1); - ;step1[3] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 16, 8, 24 - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[0] = step1b[0][i] + step1b[3][i]; - ;step2[1] = step1b[1][i] + step1b[2][i]; - ;step2[2] = step1b[1][i] - step1b[2][i]; - ;step2[3] = step1b[0][i] - step1b[3][i]; - vadd.s16 q4, q7, q6 - vsub.s16 q7, q7, q6 - vsub.s16 q6, q5, q14 - vadd.s16 q5, q5, q14 - ; -------------------------------------------------------------------------- - ; combine 0-3,4-7 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[0] = step1b[0][i] + step1b[7][i]; - ;step3[1] = step1b[1][i] + step1b[6][i]; - ;step3[2] = step1b[2][i] + step1b[5][i]; - ;step3[3] = step1b[3][i] + step1b[4][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q3 - vadd.s16 q10, q6, q1 - vadd.s16 q11, q7, q0 - ;step3[4] = step1b[3][i] - step1b[4][i]; - ;step3[5] = step1b[2][i] - step1b[5][i]; - ;step3[6] = step1b[1][i] - step1b[6][i]; - ;step3[7] = step1b[0][i] - step1b[7][i]; - vsub.s16 q12, q7, q0 - vsub.s16 q13, q6, q1 - vsub.s16 q14, q5, q3 - vsub.s16 q15, q4, q2 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[0] = step1b[0][i] + step1b[15][i]; - ;step1[1] = step1b[1][i] + step1b[14][i]; - ;step1[14] = step1b[1][i] - step1b[14][i]; - ;step1[15] = step1b[0][i] - step1b[15][i]; - LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 - vadd.s16 q2, q8, q1 - vadd.s16 q3, q9, q0 - vsub.s16 q4, q9, q0 - vsub.s16 q5, q8, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[14 * 32] = step1b[14][i] + step1b[17][i]; - ;output[15 * 32] = step1b[15][i] + step1b[16][i]; - ;output[16 * 32] = step1b[15][i] - step1b[16][i]; - ;output[17 * 32] = step1b[14][i] - step1b[17][i]; - LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - - cmp r5, #0 - bgt idct32_bands_end_2nd_pass - -idct32_bands_end_1st_pass - STORE_IN_OUTPUT 17, 16, 17, q6, q7 - STORE_IN_OUTPUT 17, 14, 15, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 30, 31, q6, q7 - STORE_IN_OUTPUT 31, 0, 1, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 19, 18, 19, q6, q7 - STORE_IN_OUTPUT 19, 12, 13, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 28, 29, q6, q7 - STORE_IN_OUTPUT 29, 2, 3, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 21, 20, 21, q6, q7 - STORE_IN_OUTPUT 21, 10, 11, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 26, 27, q6, q7 - STORE_IN_OUTPUT 27, 4, 5, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 23, 22, 23, q6, q7 - STORE_IN_OUTPUT 23, 8, 9, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 24, 25, q6, q7 - STORE_IN_OUTPUT 25, 6, 7, q4, q5 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #7*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; parameters for second pass - ; the input of pass2 is the result of pass1. we have to remove the offset - ; of 32 columns induced by the above idct32_bands_loop - sub r3, r1, #32*2 - ; r1 = pass2[32 * 32] - add r1, sp, #2048 - - ; pass loop processing - add r5, r5, #1 - b idct32_pass_loop - -idct32_bands_end_2nd_pass - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; restore pointers to their initial indices for next band pass by - ; removing/adding dest_stride * 8. The actual increment by eight - ; is taken care of within the _LAST macros. - add r6, r6, r2, lsl #3 - add r9, r9, r2, lsl #3 - sub r7, r7, r2, lsl #3 - sub r10, r10, r2, lsl #3 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #25*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; stack operation - add sp, sp, #512+2048+2048 - vpop {d8-d15} - pop {r4-r11} - bx lr - ENDP ; |aom_idct32x32_1024_add_neon| - END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c deleted file mode 100644 index 3df7a901b..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); - - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm deleted file mode 100644 index 6bd733d5d..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm +++ /dev/null @@ -1,71 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct4x4_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct4x4_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 4) - add r0, r0, #8 ; + (1 <<((4) - 1)) - asr r0, r0, #4 ; >> 4 - - vdup.s16 q0, r0 ; duplicate a1 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q8, q0, d2 ; dest[x] + a1 - vaddw.u8 q9, q0, d4 - - vqmovun.s16 d6, q8 ; clip_pixel - vqmovun.s16 d7, q9 - - vst1.32 {d6[0]}, [r12], r2 - vst1.32 {d6[1]}, [r12], r2 - vst1.32 {d7[0]}, [r12], r2 - vst1.32 {d7[1]}, [r12] - - bx lr - ENDP ; |aom_idct4x4_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c deleted file mode 100644 index 763be1ab0..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "aom_dsp/txfm_common.h" - -void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16((int16_t)cospi_8_64); - d21s16 = vdup_n_s16((int16_t)cospi_16_64); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16((int16_t)cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm deleted file mode 100644 index 127acf614..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm +++ /dev/null @@ -1,193 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct4x4_16_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY ; name this block of code -;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct4x4_16_add_neon| PROC - - ; The 2D transform is done with two passes which are actually pretty - ; similar. We first transform the rows. This is done by transposing - ; the inputs, doing an SIMD column transform (the columns are the - ; transposed rows) and then transpose the results (so that it goes back - ; in normal/row positions). Then, we transform the columns by doing - ; another SIMD column transform. - ; So, two passes of a transpose followed by a column transform. - - ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! - - ; generate scalar constants - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x 187e - mov r12, #0x1800 - add r12, #0x7e - - ; transpose the input data - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - - ; generate constant vectors - vdup.16 d20, r0 ; replicate cospi_8_64 - vdup.16 d21, r3 ; replicate cospi_16_64 - - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - vdup.16 d22, r12 ; replicate cospi_24_64 - - ; do the transform on transposed rows - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - vswp d18, d19 - - ; transpose the results - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - ; do the transform on columns - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - - ; The results are in two registers, one of them being swapped. This will - ; be taken care of by loading the 'dest' value in a swapped fashion and - ; also storing them in the same swapped fashion. - ; temp_out[0, 1] = d16, d17 = q8 - ; temp_out[2, 3] = d19, d18 = q9 swapped - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) - vrshr.s16 q8, q8, #4 - vrshr.s16 q9, q9, #4 - - vld1.32 {d26[0]}, [r1], r2 - vld1.32 {d26[1]}, [r1], r2 - vld1.32 {d27[1]}, [r1], r2 - vld1.32 {d27[0]}, [r1] ; no post-increment - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d26 - vaddw.u8 q9, q9, d27 - - ; clip_pixel - vqmovun.s16 d26, q8 - vqmovun.s16 d27, q9 - - ; do the stores in reverse order with negative post-increment, by changing - ; the sign of the stride - rsb r2, r2, #0 - vst1.32 {d27[0]}, [r1], r2 - vst1.32 {d27[1]}, [r1], r2 - vst1.32 {d26[1]}, [r1], r2 - vst1.32 {d26[0]}, [r1] ; no post-increment - bx lr - ENDP ; |aom_idct4x4_16_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c deleted file mode 100644 index c7926f9e4..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm deleted file mode 100644 index ec07e2053..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm +++ /dev/null @@ -1,91 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |aom_idct8x8_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c deleted file mode 100644 index 8ad70862d..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16((int16_t)cospi_24_64); - d1s16 = vdup_n_s16((int16_t)cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; -} - -void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} - -void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); - q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm deleted file mode 100644 index f3d5f246d..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm +++ /dev/null @@ -1,522 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct8x8_64_add_neon| - EXPORT |aom_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. - MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |aom_idct8x8_64_add_neon| - -;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. - mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |aom_idct8x8_12_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c index 7d5f64004..69470eeb0 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -11,8 +11,9 @@ #include <arm_neon.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" //------------------------------------------------------------------------------ @@ -342,8 +343,6 @@ void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); } -#if !HAVE_NEON_ASM - void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; @@ -529,4 +528,3 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, } } } -#endif // !HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm deleted file mode 100644 index fba9c1b5b..000000000 --- a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm +++ /dev/null @@ -1,287 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_v_predictor_4x4_neon| - EXPORT |aom_v_predictor_8x8_neon| - EXPORT |aom_v_predictor_16x16_neon| - EXPORT |aom_v_predictor_32x32_neon| - EXPORT |aom_h_predictor_4x4_neon| - EXPORT |aom_h_predictor_8x8_neon| - EXPORT |aom_h_predictor_16x16_neon| - EXPORT |aom_h_predictor_32x32_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_4x4_neon| PROC - vld1.32 {d0[0]}, [r2] - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |aom_v_predictor_4x4_neon| - -;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_8x8_neon| PROC - vld1.8 {d0}, [r2] - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - bx lr - ENDP ; |aom_v_predictor_8x8_neon| - -;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_16x16_neon| PROC - vld1.8 {q0}, [r2] - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |aom_v_predictor_16x16_neon| - -;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_32x32_neon| PROC - vld1.8 {q0, q1}, [r2] - mov r2, #2 -loop_v - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - subs r2, r2, #1 - bgt loop_v - bx lr - ENDP ; |aom_v_predictor_32x32_neon| - -;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_4x4_neon| PROC - vld1.32 {d1[0]}, [r3] - vdup.8 d0, d1[0] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[1] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[2] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[3] - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |aom_h_predictor_4x4_neon| - -;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_8x8_neon| PROC - vld1.64 {d1}, [r3] - vdup.8 d0, d1[0] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[1] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[2] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[3] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[4] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[5] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[6] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[7] - vst1.64 {d0}, [r0], r1 - bx lr - ENDP ; |aom_h_predictor_8x8_neon| - -;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_16x16_neon| PROC - vld1.8 {q1}, [r3] - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |aom_h_predictor_16x16_neon| - -;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_32x32_neon| PROC - sub r1, r1, #16 - mov r2, #2 -loop_h - vld1.8 {q1}, [r3]! - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - subs r2, r2, #1 - bgt loop_h - bx lr - ENDP ; |aom_h_predictor_32x32_neon| diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c deleted file mode 100644 index c0562a6ea..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" -#include "aom/aom_integer.h" - -static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit - uint8x16_t qlimit, // limit - uint8x16_t qthresh, // thresh - uint8x16_t q3, // p3 - uint8x16_t q4, // p2 - uint8x16_t q5, // p1 - uint8x16_t q6, // p0 - uint8x16_t q7, // q0 - uint8x16_t q8, // q1 - uint8x16_t q9, // q2 - uint8x16_t q10, // q3 - uint8x16_t *q5r, // p1 - uint8x16_t *q6r, // p0 - uint8x16_t *q7r, // q0 - uint8x16_t *q8r) { // q1 - uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int16x8_t q2s16, q11s16; - uint16x8_t q4u16; - int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; - int8x8_t d2s8, d3s8; - - q11u8 = vabdq_u8(q3, q4); - q12u8 = vabdq_u8(q4, q5); - q13u8 = vabdq_u8(q5, q6); - q14u8 = vabdq_u8(q8, q7); - q3 = vabdq_u8(q9, q8); - q4 = vabdq_u8(q10, q9); - - q11u8 = vmaxq_u8(q11u8, q12u8); - q12u8 = vmaxq_u8(q13u8, q14u8); - q3 = vmaxq_u8(q3, q4); - q15u8 = vmaxq_u8(q11u8, q12u8); - - q9 = vabdq_u8(q6, q7); - - // aom_hevmask - q13u8 = vcgtq_u8(q13u8, qthresh); - q14u8 = vcgtq_u8(q14u8, qthresh); - q15u8 = vmaxq_u8(q15u8, q3); - - q2u8 = vabdq_u8(q5, q8); - q9 = vqaddq_u8(q9, q9); - - q15u8 = vcgeq_u8(qlimit, q15u8); - - // aom_filter() function - // convert to signed - q10 = vdupq_n_u8(0x80); - q8 = veorq_u8(q8, q10); - q7 = veorq_u8(q7, q10); - q6 = veorq_u8(q6, q10); - q5 = veorq_u8(q5, q10); - - q2u8 = vshrq_n_u8(q2u8, 1); - q9 = vqaddq_u8(q9, q2u8); - - q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), - vget_low_s8(vreinterpretq_s8_u8(q6))); - q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), - vget_high_s8(vreinterpretq_s8_u8(q6))); - - q9 = vcgeq_u8(qblimit, q9); - - q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); - - q14u8 = vorrq_u8(q13u8, q14u8); - - q4u16 = vdupq_n_u16(3); - q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); - q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); - - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); - q15u8 = vandq_u8(q15u8, q9); - - q1s8 = vreinterpretq_s8_u8(q1u8); - q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); - q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); - - q4 = vdupq_n_u8(3); - q9 = vdupq_n_u8(4); - // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0)) - d2s8 = vqmovn_s16(q2s16); - d3s8 = vqmovn_s16(q11s16); - q1s8 = vcombine_s8(d2s8, d3s8); - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); - q1s8 = vreinterpretq_s8_u8(q1u8); - - q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); - q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); - q2s8 = vshrq_n_s8(q2s8, 3); - q1s8 = vshrq_n_s8(q1s8, 3); - - q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); - q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); - - q1s8 = vrshrq_n_s8(q1s8, 1); - q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); - - q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); - q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); - - *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); - *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); - *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); - *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); - return; -} - -void aom_lpf_horizontal_4_dual_neon( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; - uint8x16_t qblimit, qlimit, qthresh; - uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; - - dblimit0 = vld1_u8(blimit0); - dlimit0 = vld1_u8(limit0); - dthresh0 = vld1_u8(thresh0); - dblimit1 = vld1_u8(blimit1); - dlimit1 = vld1_u8(limit1); - dthresh1 = vld1_u8(thresh1); - qblimit = vcombine_u8(dblimit0, dblimit1); - qlimit = vcombine_u8(dlimit0, dlimit1); - qthresh = vcombine_u8(dthresh0, dthresh1); - - s -= (p << 2); - - q3u8 = vld1q_u8(s); - s += p; - q4u8 = vld1q_u8(s); - s += p; - q5u8 = vld1q_u8(s); - s += p; - q6u8 = vld1q_u8(s); - s += p; - q7u8 = vld1q_u8(s); - s += p; - q8u8 = vld1q_u8(s); - s += p; - q9u8 = vld1q_u8(s); - s += p; - q10u8 = vld1q_u8(s); - - loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, - q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); - - s -= (p * 5); - vst1q_u8(s, q5u8); - s += p; - vst1q_u8(s, q6u8); - s += p; - vst1q_u8(s, q7u8); - s += p; - vst1q_u8(s, q8u8); - return; -} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm deleted file mode 100644 index b6e2c9edb..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm +++ /dev/null @@ -1,202 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_4_dual_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p, -; const uint8_t *blimit0, -; const uint8_t *limit0, -; const uint8_t *thresh0, -; const uint8_t *blimit1, -; const uint8_t *limit1, -; const uint8_t *thresh1) -; r0 uint8_t *s, -; r1 int p, -; r2 const uint8_t *blimit0, -; r3 const uint8_t *limit0, -; sp const uint8_t *thresh0, -; sp+4 const uint8_t *blimit1, -; sp+8 const uint8_t *limit1, -; sp+12 const uint8_t *thresh1, - -|aom_lpf_horizontal_4_dual_neon| PROC - push {lr} - - ldr r12, [sp, #4] ; load thresh0 - vld1.8 {d0}, [r2] ; load blimit0 to first half q - vld1.8 {d2}, [r3] ; load limit0 to first half q - - add r1, r1, r1 ; double pitch - ldr r2, [sp, #8] ; load blimit1 - - vld1.8 {d4}, [r12] ; load thresh0 to first half q - - ldr r3, [sp, #12] ; load limit1 - ldr r12, [sp, #16] ; load thresh1 - vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q - - sub r2, r0, r1, lsl #1 ; s[-4 * p] - - vld1.8 {d3}, [r3] ; load limit1 to 2nd half q - vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q - - vpush {d8-d15} ; save neon registers - - add r3, r2, r1, lsr #1 ; s[-3 * p] - - vld1.u8 {q3}, [r2@64], r1 ; p3 - vld1.u8 {q4}, [r3@64], r1 ; p2 - vld1.u8 {q5}, [r2@64], r1 ; p1 - vld1.u8 {q6}, [r3@64], r1 ; p0 - vld1.u8 {q7}, [r2@64], r1 ; q0 - vld1.u8 {q8}, [r3@64], r1 ; q1 - vld1.u8 {q9}, [r2@64] ; q2 - vld1.u8 {q10}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl aom_loop_filter_neon_16 - - vst1.u8 {q5}, [r2@64], r1 ; store op1 - vst1.u8 {q6}, [r3@64], r1 ; store op0 - vst1.u8 {q7}, [r2@64], r1 ; store oq0 - vst1.u8 {q8}, [r3@64], r1 ; store oq1 - - vpop {d8-d15} ; restore neon registers - - pop {pc} - ENDP ; |aom_lpf_horizontal_4_dual_neon| - -; void aom_loop_filter_neon_16(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. This function uses -; registers d8-d15, so the calling function must save those registers. -; -; r0-r3, r12 PRESERVE -; q0 blimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -; -; Outputs: -; q5 op1 -; q6 op0 -; q7 oq0 -; q8 oq1 -|aom_loop_filter_neon_16| PROC - - ; filter_mask - vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) - vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) - vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) - vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) - vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) - vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) - vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) - - vmov.u8 q10, #0x80 - - vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) - - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - veor q7, q7, q10 ; qs0 - - vcge.u8 q15, q1, q15 ; abs(m11) > limit - - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u16 q4, #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; a > blimit - - vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; hev - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; filter &= hev - vand q15, q15, q9 ; mask - - vmov.u8 q4, #3 - - vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; filter &= mask - - vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) - vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) - vshr.s8 q2, q2, #3 ; filter2 >>= 3 - vshr.s8 q1, q1, #3 ; filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) - vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 - - veor q7, q0, q10 ; *oq0 = u^0x80 - - vbic q1, q1, q14 ; filter &= ~hev - - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) - - veor q6, q11, q10 ; *op0 = u^0x80 - veor q5, q13, q10 ; *op1 = u^0x80 - veor q8, q12, q10 ; *oq1 = u^0x80 - - bx lr - ENDP ; |aom_loop_filter_neon_16| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c deleted file mode 100644 index 2b1f80b81..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_dsp_rtcd.h" - -static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; -} - -void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; -} - -void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < 1; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm deleted file mode 100644 index 8b54984d5..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm +++ /dev/null @@ -1,252 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_4_neon| - EXPORT |aom_lpf_vertical_4_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_horizontal_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_horizontal_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #4] ; load thresh - add r1, r1, r1 ; double pitch - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - - sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r3, r2, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r2@64], r1 ; p3 - vld1.u8 {d4}, [r3@64], r1 ; p2 - vld1.u8 {d5}, [r2@64], r1 ; p1 - vld1.u8 {d6}, [r3@64], r1 ; p0 - vld1.u8 {d7}, [r2@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r2@64] ; q2 - vld1.u8 {d18}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl aom_loop_filter_neon - - vst1.u8 {d4}, [r2@64], r1 ; store op1 - vst1.u8 {d5}, [r3@64], r1 ; store op0 - vst1.u8 {d6}, [r2@64], r1 ; store oq0 - vst1.u8 {d7}, [r3@64], r1 ; store oq1 - - pop {pc} - ENDP ; |aom_lpf_horizontal_4_neon| - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_vertical_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #4] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - bl aom_loop_filter_neon - - sub r0, r0, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 - vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 - vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 - vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 - vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 - vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 - vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 - vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] - - pop {pc} - ENDP ; |aom_lpf_vertical_4_neon| - -; void aom_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d4 op1 -; d5 op0 -; d6 oq0 -; d7 oq1 -|aom_loop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d17, d6, d7 ; abs(p0 - q0) - - vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) - - vmov.u8 d18, #0x80 - - vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) - - ; hevmask - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) - - vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 - - veor d7, d7, d18 ; qs0 - - vcge.u8 d23, d1, d23 ; abs(m1) > limit - - ; filter() function - ; convert to signed - - vshr.u8 d28, d28, #1 ; a = a / 2 - veor d6, d6, d18 ; ps0 - - veor d5, d5, d18 ; ps1 - vqadd.u8 d17, d17, d28 ; a = b + a - - veor d16, d16, d18 ; qs1 - - vmov.u8 d19, #3 - - vsub.s8 d28, d7, d6 ; ( qs0 - ps0) - - vcge.u8 d17, d0, d17 ; a > blimit - - vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) - vorr d22, d21, d22 ; hevmask - - vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) - - vand d27, d27, d22 ; filter &= hev - vand d23, d23, d17 ; filter_mask - - vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d17, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d27, q12 - - vand d27, d27, d23 ; filter &= mask - - vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) - vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) - vshr.s8 d28, d28, #3 ; filter2 >>= 3 - vshr.s8 d27, d27, #3 ; filter1 >>= 3 - - vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) - vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 - - veor d6, d26, d18 ; *oq0 = u^0x80 - - vbic d27, d27, d22 ; filter &= ~hev - - vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) - vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) - - veor d5, d19, d18 ; *op0 = u^0x80 - veor d4, d21, d18 ; *op1 = u^0x80 - veor d7, d20, d18 ; *oq1 = u^0x80 - - bx lr - ENDP ; |aom_loop_filter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c deleted file mode 100644 index c4502fdb5..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "./aom_dsp_rtcd.h" - -static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - - d25u8 = vabd_u8(d6u8, d4u8); - - d23u8 = vmax_u8(d23u8, d24u8); - - d26u8 = vabd_u8(d7u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); - - d19u8 = vmax_u8(d19u8, d23u8); - - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); - - d19u8 = vcge_u8(dlimit, d19u8); - - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); - - d23u8 = vshr_n_u8(d23u8, 1); - - d25u8 = vmax_u8(d25u8, d26u8); - - d24u8 = vqadd_u8(d24u8, d23u8); - - d20u8 = vmax_u8(d20u8, d25u8); - - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - - d20u8 = vcge_u8(d23u8, d20u8); - - d19u8 = vand_u8(d19u8, d24u8); - - d23u8 = vcgt_u8(d22u8, dthresh); - - d20u8 = vand_u8(d20u8, d19u8); - - d22u8 = vdup_n_u8(0x80); - - d23u8 = vorr_u8(d21u8, d23u8); - - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); - - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); - - d27u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - q15s16 = vaddw_s8(q15s16, d29s8); - - d29u8 = vdup_n_u8(4); - - d28s8 = vqmovn_s16(q15s16); - - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); - - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } - - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - - q14u16 = vaddw_u8(q14u16, d5u8); - - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - - d30u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - - d31u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - - d23u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - - d22u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - - d6u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - - d7u8 = vqrshrn_n_u16(q14u16, 3); - - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; -} - -void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, - &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; -} - -void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i < 1; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, - &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm deleted file mode 100644 index 9f3db66ee..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm +++ /dev/null @@ -1,428 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_8_neon| - EXPORT |aom_lpf_vertical_8_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_horizontal_8_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_horizontal_8_neon| PROC - push {r4-r5, lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #12] ; load thresh - add r1, r1, r1 ; double pitch - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - - sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r2, r3, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r3@64], r1 ; p3 - vld1.u8 {d4}, [r2@64], r1 ; p2 - vld1.u8 {d5}, [r3@64], r1 ; p1 - vld1.u8 {d6}, [r2@64], r1 ; p0 - vld1.u8 {d7}, [r3@64], r1 ; q0 - vld1.u8 {d16}, [r2@64], r1 ; q1 - vld1.u8 {d17}, [r3@64] ; q2 - vld1.u8 {d18}, [r2@64], r1 ; q3 - - sub r3, r3, r1, lsl #1 - sub r2, r2, r1, lsl #2 - - bl aom_mbloop_filter_neon - - vst1.u8 {d0}, [r2@64], r1 ; store op2 - vst1.u8 {d1}, [r3@64], r1 ; store op1 - vst1.u8 {d2}, [r2@64], r1 ; store op0 - vst1.u8 {d3}, [r3@64], r1 ; store oq0 - vst1.u8 {d4}, [r2@64], r1 ; store oq1 - vst1.u8 {d5}, [r3@64], r1 ; store oq2 - - pop {r4-r5, pc} - - ENDP ; |aom_lpf_horizontal_8_neon| - -; void aom_lpf_vertical_8_neon(uint8_t *s, -; int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_8_neon| PROC - push {r4-r5, lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #12] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - sub r2, r0, #3 - add r3, r0, #1 - - bl aom_mbloop_filter_neon - - ;store op2, op1, op0, oq0 - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 - vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 - vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 - vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 - vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 - vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 - vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 - vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] - - ;store oq1, oq2 - vst2.8 {d4[0], d5[0]}, [r3], r1 - vst2.8 {d4[1], d5[1]}, [r3], r1 - vst2.8 {d4[2], d5[2]}, [r3], r1 - vst2.8 {d4[3], d5[3]}, [r3], r1 - vst2.8 {d4[4], d5[4]}, [r3], r1 - vst2.8 {d4[5], d5[5]}, [r3], r1 - vst2.8 {d4[6], d5[6]}, [r3], r1 - vst2.8 {d4[7], d5[7]}, [r3] - - pop {r4-r5, pc} - ENDP ; |aom_lpf_vertical_8_neon| - -; void aom_mbloop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d0 op2 -; d1 op1 -; d2 op0 -; d3 oq0 -; d4 oq1 -; d5 oq2 -|aom_mbloop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) - - vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) - - vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) - - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) - vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) - vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d1, d19 - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) - vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) - - vshr.u8 d23, d23, #1 ; a = a / 2 - - vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) - - vqadd.u8 d24, d24, d23 ; a = b + a - - vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) - - vmov.u8 d23, #1 - vcge.u8 d24, d0, d24 ; a > blimit - - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - - vcge.u8 d20, d23, d20 ; flat - - vand d19, d19, d24 ; mask - - vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - - vand d20, d20, d19 ; flat & mask - - vmov.u8 d22, #0x80 - - vorr d23, d21, d23 ; hev - - ; This instruction will truncate the "flat & mask" masks down to 4 bits - ; each to fit into one 32 bit arm register. The values are stored in - ; q10.64[0]. - vshrn.u16 d30, q10, #4 - vmov.u32 r4, d30[0] ; flat & mask 4bits - - adds r5, r4, #1 ; Check for all 1's - - ; If mask and flat are 1's for all vectors, then we only need to execute - ; the power branch for all vectors. - beq power_branch_only - - cmp r4, #0 ; Check for 0, set flag for later - - ; mbfilter() function - ; filter() function - ; convert to signed - veor d21, d7, d22 ; qs0 - veor d24, d6, d22 ; ps0 - veor d25, d5, d22 ; ps1 - veor d26, d16, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d21, d24 ; ( qs0 - ps0) - - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - - vand d29, d29, d23 ; filter &= hev - - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 d29, d29, #1 - vbic d29, d29, d23 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - ; If mask and flat are 0's for all vectors, then we only need to execute - ; the filter branch for all vectors. - beq filter_branch_only - - ; If mask and flat are mixed then we must perform both branches and - ; combine the data. - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d21, d21, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - ; At this point we have already executed the filter branch. The filter - ; branch does not set op2 or oq2, so use p2 and q2. Execute the power - ; branch and combine the data. - vmov.u8 d23, #2 - vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 - vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 - vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 - - vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) - - vaddw.u8 q14, d5 ; r_op2 += p1 - - vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) - - vqrshrn.u16 d30, q14, #3 ; r_op2 - - vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 - vsubw.u8 q14, d4 ; r_op1 -= p2 - vaddw.u8 q14, d5 ; r_op1 += p1 - vaddw.u8 q14, d16 ; r_op1 += q1 - - vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) - - vqrshrn.u16 d31, q14, #3 ; r_op1 - - vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 - vsubw.u8 q14, d5 ; r_op0 -= p1 - vaddw.u8 q14, d6 ; r_op0 += p0 - vaddw.u8 q14, d17 ; r_op0 += q2 - - vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) - - vqrshrn.u16 d23, q14, #3 ; r_op0 - - vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 - vsubw.u8 q14, d6 ; r_oq0 -= p0 - vaddw.u8 q14, d7 ; r_oq0 += q0 - - vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) - - vaddw.u8 q14, d18 ; oq0 += q3 - - vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) - - vqrshrn.u16 d22, q14, #3 ; r_oq0 - - vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 - vsubw.u8 q14, d7 ; r_oq1 -= q0 - vaddw.u8 q14, d16 ; r_oq1 += q1 - - vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) - - vaddw.u8 q14, d18 ; r_oq1 += q3 - - vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) - - vqrshrn.u16 d6, q14, #3 ; r_oq1 - - vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 - vsubw.u8 q14, d16 ; r_oq2 -= q1 - vaddw.u8 q14, d17 ; r_oq2 += q2 - vaddw.u8 q14, d18 ; r_oq2 += q3 - - vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) - - vqrshrn.u16 d7, q14, #3 ; r_oq2 - - vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) - vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) - vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) - - bx lr - -power_branch_only - vmov.u8 d27, #3 - vmov.u8 d21, #2 - vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 - vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 - vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 - vaddw.u8 q14, d5 ; op2 += p1 - vqrshrn.u16 d0, q14, #3 ; op2 - - vsubw.u8 q14, d3 ; op1 = op2 - p3 - vsubw.u8 q14, d4 ; op1 -= p2 - vaddw.u8 q14, d5 ; op1 += p1 - vaddw.u8 q14, d16 ; op1 += q1 - vqrshrn.u16 d1, q14, #3 ; op1 - - vsubw.u8 q14, d3 ; op0 = op1 - p3 - vsubw.u8 q14, d5 ; op0 -= p1 - vaddw.u8 q14, d6 ; op0 += p0 - vaddw.u8 q14, d17 ; op0 += q2 - vqrshrn.u16 d2, q14, #3 ; op0 - - vsubw.u8 q14, d3 ; oq0 = op0 - p3 - vsubw.u8 q14, d6 ; oq0 -= p0 - vaddw.u8 q14, d7 ; oq0 += q0 - vaddw.u8 q14, d18 ; oq0 += q3 - vqrshrn.u16 d3, q14, #3 ; oq0 - - vsubw.u8 q14, d4 ; oq1 = oq0 - p2 - vsubw.u8 q14, d7 ; oq1 -= q0 - vaddw.u8 q14, d16 ; oq1 += q1 - vaddw.u8 q14, d18 ; oq1 += q3 - vqrshrn.u16 d4, q14, #3 ; oq1 - - vsubw.u8 q14, d5 ; oq2 = oq1 - p1 - vsubw.u8 q14, d16 ; oq2 -= q1 - vaddw.u8 q14, d17 ; oq2 += q2 - vaddw.u8 q14, d18 ; oq2 += q3 - vqrshrn.u16 d5, q14, #3 ; oq2 - - bx lr - -filter_branch_only - ; TODO(fgalligan): See if we can rearange registers so we do not need to - ; do the 2 vswp. - vswp d0, d4 ; op2 - vswp d5, d17 ; oq2 - veor d2, d24, d22 ; *op0 = u^0x80 - veor d3, d21, d22 ; *oq0 = u^0x80 - veor d1, d25, d22 ; *op1 = u^0x80 - veor d4, d26, d22 ; *oq1 = u^0x80 - - bx lr - - ENDP ; |aom_mbloop_filter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm deleted file mode 100644 index 675928860..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm +++ /dev/null @@ -1,638 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_edge_8_neon| - EXPORT |aom_lpf_horizontal_edge_16_neon| - EXPORT |aom_lpf_vertical_16_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void mb_lpf_horizontal_edge(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; r12 int count -|mb_lpf_horizontal_edge| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - -h_count - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d0}, [r8@64], r1 ; p7 - vld1.u8 {d1}, [r8@64], r1 ; p6 - vld1.u8 {d2}, [r8@64], r1 ; p5 - vld1.u8 {d3}, [r8@64], r1 ; p4 - vld1.u8 {d4}, [r8@64], r1 ; p3 - vld1.u8 {d5}, [r8@64], r1 ; p2 - vld1.u8 {d6}, [r8@64], r1 ; p1 - vld1.u8 {d7}, [r8@64], r1 ; p0 - vld1.u8 {d8}, [r8@64], r1 ; q0 - vld1.u8 {d9}, [r8@64], r1 ; q1 - vld1.u8 {d10}, [r8@64], r1 ; q2 - vld1.u8 {d11}, [r8@64], r1 ; q3 - vld1.u8 {d12}, [r8@64], r1 ; q4 - vld1.u8 {d13}, [r8@64], r1 ; q5 - vld1.u8 {d14}, [r8@64], r1 ; q6 - vld1.u8 {d15}, [r8@64], r1 ; q7 - - bl aom_wide_mbfilter_neon - - tst r7, #1 - beq h_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. - sub r8, r0, r1, lsl #1 - - vst1.u8 {d25}, [r8@64], r1 ; store op1 - vst1.u8 {d24}, [r8@64], r1 ; store op0 - vst1.u8 {d23}, [r8@64], r1 ; store oq0 - vst1.u8 {d26}, [r8@64], r1 ; store oq1 - - b h_next - -h_mbfilter - tst r7, #2 - beq h_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, r1, lsl #1 - sub r8, r8, r1 - - vst1.u8 {d18}, [r8@64], r1 ; store op2 - vst1.u8 {d19}, [r8@64], r1 ; store op1 - vst1.u8 {d20}, [r8@64], r1 ; store op0 - vst1.u8 {d21}, [r8@64], r1 ; store oq0 - vst1.u8 {d22}, [r8@64], r1 ; store oq1 - vst1.u8 {d23}, [r8@64], r1 ; store oq2 - - b h_next - -h_wide_mbfilter - sub r8, r0, r1, lsl #3 - add r8, r8, r1 - - vst1.u8 {d16}, [r8@64], r1 ; store op6 - vst1.u8 {d24}, [r8@64], r1 ; store op5 - vst1.u8 {d25}, [r8@64], r1 ; store op4 - vst1.u8 {d26}, [r8@64], r1 ; store op3 - vst1.u8 {d27}, [r8@64], r1 ; store op2 - vst1.u8 {d18}, [r8@64], r1 ; store op1 - vst1.u8 {d19}, [r8@64], r1 ; store op0 - vst1.u8 {d20}, [r8@64], r1 ; store oq0 - vst1.u8 {d21}, [r8@64], r1 ; store oq1 - vst1.u8 {d22}, [r8@64], r1 ; store oq2 - vst1.u8 {d23}, [r8@64], r1 ; store oq3 - vst1.u8 {d1}, [r8@64], r1 ; store oq4 - vst1.u8 {d2}, [r8@64], r1 ; store oq5 - vst1.u8 {d3}, [r8@64], r1 ; store oq6 - -h_next - add r0, r0, #8 - subs r12, r12, #1 - bne h_count - - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |mb_lpf_horizontal_edge| - -; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|aom_lpf_horizontal_edge_8_neon| PROC - mov r12, #1 - b mb_lpf_horizontal_edge - ENDP ; |aom_lpf_horizontal_edge_8_neon| - -; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|aom_lpf_horizontal_edge_16_neon| PROC - mov r12, #2 - b mb_lpf_horizontal_edge - ENDP ; |aom_lpf_horizontal_edge_16_neon| - -; void aom_lpf_vertical_16_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_16_neon| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, #8 - - vld1.8 {d0}, [r8@64], r1 - vld1.8 {d8}, [r0@64], r1 - vld1.8 {d1}, [r8@64], r1 - vld1.8 {d9}, [r0@64], r1 - vld1.8 {d2}, [r8@64], r1 - vld1.8 {d10}, [r0@64], r1 - vld1.8 {d3}, [r8@64], r1 - vld1.8 {d11}, [r0@64], r1 - vld1.8 {d4}, [r8@64], r1 - vld1.8 {d12}, [r0@64], r1 - vld1.8 {d5}, [r8@64], r1 - vld1.8 {d13}, [r0@64], r1 - vld1.8 {d6}, [r8@64], r1 - vld1.8 {d14}, [r0@64], r1 - vld1.8 {d7}, [r8@64], r1 - vld1.8 {d15}, [r0@64], r1 - - sub r0, r0, r1, lsl #3 - - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - vtrn.8 d4, d5 - vtrn.8 d6, d7 - - vtrn.8 d8, d9 - vtrn.8 d10, d11 - vtrn.8 d12, d13 - vtrn.8 d14, d15 - - bl aom_wide_mbfilter_neon - - tst r7, #1 - beq v_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. - sub r8, r0, #2 - - vswp d23, d25 - - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 - - b v_end - -v_mbfilter - tst r7, #2 - beq v_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, #3 - - vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 - vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 - vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 - vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 - vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 - vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 - vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 - vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 - vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 - vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 - vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 - vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 - vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 - vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 - vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 - vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - - b v_end - -v_wide_mbfilter - sub r8, r0, #8 - - vtrn.32 d0, d26 - vtrn.32 d16, d27 - vtrn.32 d24, d18 - vtrn.32 d25, d19 - - vtrn.16 d0, d24 - vtrn.16 d16, d25 - vtrn.16 d26, d18 - vtrn.16 d27, d19 - - vtrn.8 d0, d16 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - vtrn.8 d18, d19 - - vtrn.32 d20, d1 - vtrn.32 d21, d2 - vtrn.32 d22, d3 - vtrn.32 d23, d15 - - vtrn.16 d20, d22 - vtrn.16 d21, d23 - vtrn.16 d1, d3 - vtrn.16 d2, d15 - - vtrn.8 d20, d21 - vtrn.8 d22, d23 - vtrn.8 d1, d2 - vtrn.8 d3, d15 - - vst1.8 {d0}, [r8@64], r1 - vst1.8 {d20}, [r0@64], r1 - vst1.8 {d16}, [r8@64], r1 - vst1.8 {d21}, [r0@64], r1 - vst1.8 {d24}, [r8@64], r1 - vst1.8 {d22}, [r0@64], r1 - vst1.8 {d25}, [r8@64], r1 - vst1.8 {d23}, [r0@64], r1 - vst1.8 {d26}, [r8@64], r1 - vst1.8 {d1}, [r0@64], r1 - vst1.8 {d27}, [r8@64], r1 - vst1.8 {d2}, [r0@64], r1 - vst1.8 {d18}, [r8@64], r1 - vst1.8 {d3}, [r0@64], r1 - vst1.8 {d19}, [r8@64], r1 - vst1.8 {d15}, [r0@64], r1 - -v_end - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |aom_lpf_vertical_16_neon| - -; void aom_wide_mbfilter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. -; -; r0-r3 PRESERVE -; d16 blimit -; d17 limit -; d18 thresh -; d0 p7 -; d1 p6 -; d2 p5 -; d3 p4 -; d4 p3 -; d5 p2 -; d6 p1 -; d7 p0 -; d8 q0 -; d9 q1 -; d10 q2 -; d11 q3 -; d12 q4 -; d13 q5 -; d14 q6 -; d15 q7 -|aom_wide_mbfilter_neon| PROC - mov r7, #0 - - ; filter_mask - vabd.u8 d19, d4, d5 ; abs(p3 - p2) - vabd.u8 d20, d5, d6 ; abs(p2 - p1) - vabd.u8 d21, d6, d7 ; abs(p1 - p0) - vabd.u8 d22, d9, d8 ; abs(q1 - q0) - vabd.u8 d23, d10, d9 ; abs(q2 - q1) - vabd.u8 d24, d11, d10 ; abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d7, d8 ; abs(p0 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d17, d19 - - ; flatmask4 - vabd.u8 d25, d7, d5 ; abs(p0 - p2) - vabd.u8 d26, d8, d10 ; abs(q0 - q2) - vabd.u8 d27, d4, d7 ; abs(p3 - p0) - vabd.u8 d28, d11, d8 ; abs(q3 - q0) - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 - - vshr.u8 d23, d23, #1 ; a = a / 2 - vqadd.u8 d24, d24, d23 ; a = b + a - - vmov.u8 d30, #1 - vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 - - vcge.u8 d20, d30, d20 ; flat - - vand d19, d19, d24 ; mask - - ; hevmask - vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 ; hev - - vand d16, d20, d19 ; flat && mask - vmov r5, r6, d16 - - ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) - vabd.u8 d22, d3, d7 ; abs(p4 - p0) - vabd.u8 d23, d12, d8 ; abs(q4 - q0) - vabd.u8 d24, d7, d2 ; abs(p0 - p5) - vabd.u8 d25, d8, d13 ; abs(q0 - q5) - vabd.u8 d26, d1, d7 ; abs(p6 - p0) - vabd.u8 d27, d14, d8 ; abs(q6 - q0) - vabd.u8 d28, d0, d7 ; abs(p7 - p0) - vabd.u8 d29, d15, d8 ; abs(q7 - q0) - - ; only compare the largest value to thresh - vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) - vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) - vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) - vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) - - vmax.u8 d26, d22, d23 - vmax.u8 d27, d24, d25 - vmax.u8 d23, d26, d27 - - vcge.u8 d18, d30, d23 ; flat2 - - vmov.u8 d22, #0x80 - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #1 ; Only do filter branch - - vand d17, d18, d16 ; flat2 && flat && mask - vmov r5, r6, d17 - - ; mbfilter() function - - ; filter() function - ; convert to signed - veor d23, d8, d22 ; qs0 - veor d24, d7, d22 ; ps0 - veor d25, d6, d22 ; ps1 - veor d26, d9, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d23, d24 ; ( qs0 - ps0) - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 d29, d29, #1 - vbic d29, d29, d21 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d23, d23, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - tst r7, #1 - bxne lr - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #2 ; Only do mbfilter branch - - ; mbfilter flat && mask branch - ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's - ; and using vibt on the q's? - vmov.u8 d29, #2 - vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 - vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 - vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 - vaddl.u8 q10, d4, d5 - vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 - vaddl.u8 q14, d6, d9 - vqrshrn.u16 d18, q15, #3 ; r_op2 - - vsub.i16 q15, q10 - vaddl.u8 q10, d4, d6 - vadd.i16 q15, q14 - vaddl.u8 q14, d7, d10 - vqrshrn.u16 d19, q15, #3 ; r_op1 - - vsub.i16 q15, q10 - vadd.i16 q15, q14 - vaddl.u8 q14, d8, d11 - vqrshrn.u16 d20, q15, #3 ; r_op0 - - vsubw.u8 q15, d4 ; oq0 = op0 - p3 - vsubw.u8 q15, d7 ; oq0 -= p0 - vadd.i16 q15, q14 - vaddl.u8 q14, d9, d11 - vqrshrn.u16 d21, q15, #3 ; r_oq0 - - vsubw.u8 q15, d5 ; oq1 = oq0 - p2 - vsubw.u8 q15, d8 ; oq1 -= q0 - vadd.i16 q15, q14 - vaddl.u8 q14, d10, d11 - vqrshrn.u16 d22, q15, #3 ; r_oq1 - - vsubw.u8 q15, d6 ; oq2 = oq0 - p1 - vsubw.u8 q15, d9 ; oq2 -= q1 - vadd.i16 q15, q14 - vqrshrn.u16 d27, q15, #3 ; r_oq2 - - ; Filter does not set op2 or oq2, so use p2 and q2. - vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) - vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) - vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) - vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) - vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) - - vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) - vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) - - tst r7, #2 - bxne lr - - ; wide_mbfilter flat2 && flat && mask branch - vmov.u8 d16, #7 - vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 - vaddl.u8 q12, d2, d3 - vaddl.u8 q13, d4, d5 - vaddl.u8 q14, d1, d6 - vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 - vadd.i16 q12, q13 - vadd.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vadd.i16 q15, q12 - vaddl.u8 q12, d0, d1 - vaddw.u8 q15, d1 - vaddl.u8 q13, d0, d2 - vadd.i16 q14, q15, q14 - vqrshrn.u16 d16, q15, #4 ; w_op6 - - vsub.i16 q15, q14, q12 - vaddl.u8 q14, d3, d10 - vqrshrn.u16 d24, q15, #4 ; w_op5 - - vsub.i16 q15, q13 - vaddl.u8 q13, d0, d3 - vadd.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vqrshrn.u16 d25, q15, #4 ; w_op4 - - vadd.i16 q15, q14 - vaddl.u8 q14, d0, d4 - vsub.i16 q15, q13 - vsub.i16 q14, q15, q14 - vqrshrn.u16 d26, q15, #4 ; w_op3 - - vaddw.u8 q15, q14, d5 ; op2 += p2 - vaddl.u8 q14, d0, d5 - vaddw.u8 q15, d12 ; op2 += q4 - vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) - vqrshrn.u16 d27, q15, #4 ; w_op2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d6 - vaddw.u8 q15, d6 ; op1 += p1 - vaddw.u8 q15, d13 ; op1 += q5 - vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) - vqrshrn.u16 d18, q15, #4 ; w_op1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d7 - vaddw.u8 q15, d7 ; op0 += p0 - vaddw.u8 q15, d14 ; op0 += q6 - vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) - vqrshrn.u16 d19, q15, #4 ; w_op0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d1, d8 - vaddw.u8 q15, d8 ; oq0 += q0 - vaddw.u8 q15, d15 ; oq0 += q7 - vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) - vqrshrn.u16 d20, q15, #4 ; w_oq0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vaddw.u8 q15, d9 ; oq1 += q1 - vaddl.u8 q4, d10, d15 - vaddw.u8 q15, d15 ; oq1 += q7 - vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) - vqrshrn.u16 d21, q15, #4 ; w_oq1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d3, d10 - vadd.i16 q15, q4 - vaddl.u8 q4, d11, d15 - vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) - vqrshrn.u16 d22, q15, #4 ; w_oq2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vadd.i16 q15, q4 - vaddl.u8 q4, d12, d15 - vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) - vqrshrn.u16 d23, q15, #4 ; w_oq3 - - vsub.i16 q15, q14 - vaddl.u8 q14, d5, d12 - vadd.i16 q15, q4 - vaddl.u8 q4, d13, d15 - vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) - vqrshrn.u16 d1, q15, #4 ; w_oq4 - - vsub.i16 q15, q14 - vaddl.u8 q14, d6, d13 - vadd.i16 q15, q4 - vaddl.u8 q4, d14, d15 - vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) - vqrshrn.u16 d2, q15, #4 ; w_oq5 - - vsub.i16 q15, q14 - vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) - vadd.i16 q15, q4 - vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) - vqrshrn.u16 d3, q15, #4 ; w_oq6 - vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) - vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) - vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) - - bx lr - ENDP ; |aom_wide_mbfilter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c index c90d6bfde..ee1a3c78f 100644 --- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -11,39 +11,690 @@ #include <arm_neon.h> -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p3q3, p2q2); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1)); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, + uint8x8_t p1q1, uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask3 values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p2q2, p1q1); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, + uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, + out_f14_pq5; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8, flat2_8x8; + uint8x8_t q0p0, q1p1, q2p2; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + // filter 14 + uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; + uint16x8_t p6q6_2, p6q6_temp, qp_sum; + uint8x8_t qp_rev; + + out = vaddw_u8(out, *p4q4); + out = vaddw_u8(out, *p5q5); + out = vaddw_u8(out, *p6q6); + + out_pq5 = vaddw_u8(out, *p4q4); + out_pq4 = vaddw_u8(out_pq5, *p3q3); + out_pq3 = vaddw_u8(out_pq4, *p2q2); + + out_pq5 = vaddw_u8(out_pq5, *p5q5); + out_pq4 = vaddw_u8(out_pq4, *p5q5); + + out_pq0 = vaddw_u8(out, *p1q1); + out_pq1 = vaddw_u8(out_pq0, *p2q2); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + + out_pq0 = vaddw_u8(out_pq0, *p0q0); + out_pq1 = vaddw_u8(out_pq1, *p0q0); + + out_pq1 = vaddw_u8(out_pq1, *p6q6); + p6q6_2 = vaddl_u8(*p6q6, *p6q6); + out_pq2 = vaddq_u16(out_pq2, p6q6_2); + p6q6_temp = vaddw_u8(p6q6_2, *p6q6); + out_pq3 = vaddq_u16(out_pq3, p6q6_temp); + p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); + out_pq4 = vaddq_u16(out_pq4, p6q6_temp); + p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); + out_pq5 = vaddq_u16(out_pq5, p6q6_temp); + + out_pq4 = vaddw_u8(out_pq4, q1p1); + + qp_sum = vaddl_u8(q2p2, q1p1); + out_pq3 = vaddq_u16(out_pq3, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq2 = vaddq_u16(out_pq2, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq1 = vaddq_u16(out_pq1, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq0 = vaddq_u16(out_pq0, qp_sum); + + out_pq0 = vaddw_u8(out_pq0, q0p0); + + out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); + out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); + out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); + out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); + out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); + out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); + } + { + uint8x8_t filter4_cond, filter8_cond, filter14_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + filter14_cond = vand_u8(filter8_cond, flat2_8x8); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + + // filter14 outputs + *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); + *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); + *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); + *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); + *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); + *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); + } +} + +static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); -void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + uint8x8_t q0p0, q1p1, q2p2; + + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + uint8x8_t filter4_cond, filter8_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } +} + +static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f6_pq0, out_f6_pq1; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 6 + uint16x8_t out_pq0, out_pq1; + uint8x8_t pq_rev; + + out = vaddl_u8(*p0q0, *p1q1); + out = vaddq_u16(out, out); + out = vaddw_u8(out, *p2q2); + + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + out = vaddw_u8(out, pq_rev); + + out_pq0 = vaddw_u8(out, pq_rev); + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + out_pq0 = vaddw_u8(out_pq0, pq_rev); + + out_pq1 = vaddw_u8(out, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p2q2); + + out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); + } + { + uint8x8_t filter4_cond, filter6_cond; + filter6_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter6_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter6 outputs + *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); + *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); + } +} + +void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x16_t row0, row1, row2, row3; + uint8x8_t pxp3, p6p2, p5p1, p4p0; + uint8x8_t q0q4, q1q5, q2q6, q3qy; + uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3; + uint32x2_t pq_rev; + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6; + + // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3); + + pxp3 = vget_low_u8(row0); + p6p2 = vget_low_u8(row1); + p5p1 = vget_low_u8(row2); + p4p0 = vget_low_u8(row3); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + q0q4 = vget_high_u8(row0); + q1q5 = vget_high_u8(row1); + q2q6 = vget_high_u8(row2); + q3qy = vget_high_u8(row3); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy)); + pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev); + + p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2)); + + pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]); + p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]); + p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]); + p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]); + + q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]); + p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + row0 = vcombine_u8(pxp3, q0q4); + row1 = vcombine_u8(p6p2, q1q5); + row2 = vcombine_u8(p5p1, q2q6); + row3 = vcombine_u8(p4p0, q3qy); + + store_u8_8x16(src - 8, stride, row0, row1, row2, row3); } -#if HAVE_NEON_ASM -void aom_lpf_horizontal_8_dual_neon( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); +void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, p3q3_p0q0; + uint32x2_t pq_rev; + uint8x8_t p3q0, p2q1, p1q2, p0q3; + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + // row0: p3 p2 p1 p0 | q0 q1 q2 q3 + // row1: p3 p2 p1 p0 | q0 q1 q2 q3 + // row2: p3 p2 p1 p0 | q0 q1 q2 q3 + // row3: p3 p2 p1 p0 | q0 q1 q2 q3 + load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3); + + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); } -void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); +void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride))); + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride), + vreinterpret_u32_u8(p3q3), 1)); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0); + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); + vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1); } -void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh); - aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); +void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2; + + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); } -#endif // HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c index a1eeaf4b7..606950ab2 100644 --- a/third_party/aom/aom_dsp/arm/sad4d_neon.c +++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c @@ -11,8 +11,9 @@ #include <arm_neon.h> -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c index 2f452f55b..a39de91d6 100644 --- a/third_party/aom/aom_dsp/arm/sad_neon.c +++ b/third_party/aom/aom_dsp/arm/sad_neon.c @@ -11,7 +11,7 @@ #include <arm_neon.h> -#include "./aom_config.h" +#include "config/aom_config.h" #include "aom/aom_integer.h" diff --git a/third_party/aom/aom_dsp/arm/save_reg_neon.asm b/third_party/aom/aom_dsp/arm/save_reg_neon.asm deleted file mode 100644 index e04969823..000000000 --- a/third_party/aom/aom_dsp/arm/save_reg_neon.asm +++ /dev/null @@ -1,39 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_push_neon| - EXPORT |aom_pop_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|aom_push_neon| PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - -|aom_pop_neon| PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - - END - diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c index 064b72d6f..44d821821 100644 --- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c +++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c @@ -10,8 +10,9 @@ */ #include <arm_neon.h> -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c index cb8a2daf8..28f5ace8e 100644 --- a/third_party/aom/aom_dsp/arm/subtract_neon.c +++ b/third_party/aom/aom_dsp/arm/subtract_neon.c @@ -11,7 +11,8 @@ #include <arm_neon.h> -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" void aom_subtract_block_neon(int rows, int cols, int16_t *diff, diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c index dbab287e3..74385a601 100644 --- a/third_party/aom/aom_dsp/arm/variance_neon.c +++ b/third_party/aom/aom_dsp/arm/variance_neon.c @@ -11,8 +11,8 @@ #include <arm_neon.h> -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" |