diff options
Diffstat (limited to 'media/libvpx/vp8/encoder/arm')
-rw-r--r-- | media/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm | 262 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm | 212 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/arm/dct_arm.c | 22 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/arm/neon/denoising_neon.c | 478 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c | 89 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c | 269 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/arm/neon/subtract_neon.c | 154 | ||||
-rw-r--r-- | media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c | 129 |
8 files changed, 1615 insertions, 0 deletions
diff --git a/media/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm b/media/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm new file mode 100644 index 000000000..8034c1db9 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm @@ -0,0 +1,262 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp8_short_fdct4x4_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY +; void vp8_short_fdct4x4_c(short *input, short *output, int pitch) +|vp8_short_fdct4x4_armv6| PROC + + stmfd sp!, {r4 - r12, lr} + + ; PART 1 + + ; coeffs 0-3 + ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2] + + ldr r10, c7500 + ldr r11, c14500 + ldr r12, c0x22a453a0 ; [2217*4 | 5352*4] + ldr lr, c0x00080008 + ror r5, r5, #16 ; [i2 | i3] + + qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift + qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift + + add r0, r0, r2 ; update input pointer + + qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8 + smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8 + + smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500) + + ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6] + + pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2 + pkhbt r6, r5, r7, lsl #4 ; [o3 | o2] + + str r6, [r1, #4] + + ; coeffs 4-7 + ror r9, r9, #16 ; [i6 | i7] + + qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift + qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift + + add r0, r0, r2 ; update input pointer + + qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8 + smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8 + + smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500) + + ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10] + + pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2 + pkhbt r6, r8, r7, lsl #4 ; [o7 | o6] + + str r6, [r1, #12] + + ; coeffs 8-11 + ror r5, r5, #16 ; [i10 | i11] + + qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift + qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift + + add r0, r0, r2 ; update input pointer + + qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8 + smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8 + + smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500) + + ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14] + + pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2 + pkhbt r6, r8, r7, lsl #4 ; [o11 | o10] + + str r6, [r1, #20] + + ; coeffs 12-15 + ror r5, r5, #16 ; [i14 | i15] + + qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift + qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift + + qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd + ; with 2217*4 and 5352*4 without losing the + ; sign bit (overflow) + + smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8 + smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8 + + smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500) + smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500) + + pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2 + pkhbt r6, r5, r7, lsl #4 ; [o15 | o14] + + str r6, [r1, #28] + + + ; PART 2 ------------------------------------------------- + ldr r11, c12000 + ldr r10, c51000 + ldr lr, c0x00070007 + + qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12] + qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8] + qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8] + qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12] + + qadd16 r4, r4, lr ; a1 + 7 + + add r0, r11, #0x10000 ; add (d!=0) + + qadd16 r2, r4, r5 ; a1 + b1 + 7 + qsub16 r3, r4, r5 ; a1 - b1 + 7 + + ldr r12, c0x08a914e8 ; [2217 | 5352] + + lsl r8, r2, #16 ; prepare bottom halfword for scaling + asr r2, r2, #4 ; scale top halfword + lsl r9, r3, #16 ; prepare bottom halfword for scaling + asr r3, r3, #4 ; scale top halfword + pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword + pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword + + smulbt r2, r6, r12 ; [ ------ | c1*2217] + str r4, [r1, #0] ; [ o1 | o0] + smultt r3, r6, r12 ; [c1*2217 | ------ ] + str r5, [r1, #16] ; [ o9 | o8] + + smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] + smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] + + smulbb r2, r6, r12 ; [ ------ | c1*5352] + smultb r3, r6, r12 ; [c1*5352 | ------ ] + + lsls r6, r7, #16 ; d1 != 0 ? + addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) + addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) + asrs r6, r7, #16 + addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) + addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) + + smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 + smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 + + pkhtb r9, r9, r8, asr #16 + + sub r4, r4, r2 + sub r5, r5, r3 + + ldr r3, [r1, #4] ; [i3 | i2] + + pkhtb r5, r5, r4, asr #16 ; [o13|o12] + + str r9, [r1, #8] ; [o5 | 04] + + ldr r9, [r1, #12] ; [i7 | i6] + ldr r8, [r1, #28] ; [i15|i14] + ldr r2, [r1, #20] ; [i11|i10] + str r5, [r1, #24] ; [o13|o12] + + qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14] + qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10] + + qadd16 r4, r4, lr ; a1 + 7 + + qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10] + qadd16 r2, r4, r5 ; a1 + b1 + 7 + qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14] + qsub16 r3, r4, r5 ; a1 - b1 + 7 + + lsl r8, r2, #16 ; prepare bottom halfword for scaling + asr r2, r2, #4 ; scale top halfword + lsl r9, r3, #16 ; prepare bottom halfword for scaling + asr r3, r3, #4 ; scale top halfword + pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword + pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword + + smulbt r2, r6, r12 ; [ ------ | c1*2217] + str r4, [r1, #4] ; [ o3 | o2] + smultt r3, r6, r12 ; [c1*2217 | ------ ] + str r5, [r1, #20] ; [ o11 | o10] + + smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] + smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] + + smulbb r2, r6, r12 ; [ ------ | c1*5352] + smultb r3, r6, r12 ; [c1*5352 | ------ ] + + lsls r6, r7, #16 ; d1 != 0 ? + addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) + addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) + + asrs r6, r7, #16 + addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) + addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) + + smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 + smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 + + pkhtb r9, r9, r8, asr #16 + + sub r4, r4, r2 + sub r5, r5, r3 + + str r9, [r1, #12] ; [o7 | o6] + pkhtb r5, r5, r4, asr #16 ; [o15|o14] + + str r5, [r1, #28] ; [o15|o14] + + ldmfd sp!, {r4 - r12, pc} + + ENDP + +; Used constants +c7500 + DCD 7500 +c14500 + DCD 14500 +c0x22a453a0 + DCD 0x22a453a0 +c0x00080008 + DCD 0x00080008 +c12000 + DCD 12000 +c51000 + DCD 51000 +c0x00070007 + DCD 0x00070007 +c0x08a914e8 + DCD 0x08a914e8 + + END diff --git a/media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm b/media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm new file mode 100644 index 000000000..5eaf3f25a --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm @@ -0,0 +1,212 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp8_short_walsh4x4_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) +; r0 short *input, +; r1 short *output, +; r2 int pitch +|vp8_short_walsh4x4_armv6| PROC + + stmdb sp!, {r4 - r11, lr} + + ldrd r4, r5, [r0], r2 + ldr lr, c00040004 + ldrd r6, r7, [r0], r2 + + ; 0-3 + qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2] + qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2] + + ldrd r8, r9, [r0], r2 + ; 4-7 + qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6] + qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6] + + ldrd r10, r11, [r0] + ; 8-11 + qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10] + qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10] + + ; 12-15 + qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14] + qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14] + + + lsls r2, r3, #16 + smuad r11, r3, lr ; A0 = a1<<2 + d1<<2 + addne r11, r11, #1 ; A0 += (a1!=0) + + lsls r2, r7, #16 + smuad r12, r7, lr ; C0 = a1<<2 + d1<<2 + addne r12, r12, #1 ; C0 += (a1!=0) + + add r0, r11, r12 ; a1_0 = A0 + C0 + sub r11, r11, r12 ; b1_0 = A0 - C0 + + lsls r2, r5, #16 + smuad r12, r5, lr ; B0 = a1<<2 + d1<<2 + addne r12, r12, #1 ; B0 += (a1!=0) + + lsls r2, r9, #16 + smuad r2, r9, lr ; D0 = a1<<2 + d1<<2 + addne r2, r2, #1 ; D0 += (a1!=0) + + add lr, r12, r2 ; d1_0 = B0 + D0 + sub r12, r12, r2 ; c1_0 = B0 - D0 + + ; op[0,4,8,12] + adds r2, r0, lr ; a2 = a1_0 + d1_0 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r0, r0, lr ; d2 = a1_0 - d1_0 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1] ; op[0] + + addmi r0, r0, #1 ; += a2 < 0 + add r0, r0, #3 ; += 3 + ldr lr, c00040004 + mov r0, r0, asr #3 ; >> 3 + strh r0, [r1, #24] ; op[12] + + adds r2, r11, r12 ; b2 = b1_0 + c1_0 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r0, r11, r12 ; c2 = b1_0 - c1_0 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #8] ; op[4] + + addmi r0, r0, #1 ; += a2 < 0 + add r0, r0, #3 ; += 3 + smusd r3, r3, lr ; A3 = a1<<2 - d1<<2 + smusd r7, r7, lr ; C3 = a1<<2 - d1<<2 + mov r0, r0, asr #3 ; >> 3 + strh r0, [r1, #16] ; op[8] + + + ; op[3,7,11,15] + add r0, r3, r7 ; a1_3 = A3 + C3 + sub r3, r3, r7 ; b1_3 = A3 - C3 + + smusd r5, r5, lr ; B3 = a1<<2 - d1<<2 + smusd r9, r9, lr ; D3 = a1<<2 - d1<<2 + add r7, r5, r9 ; d1_3 = B3 + D3 + sub r5, r5, r9 ; c1_3 = B3 - D3 + + adds r2, r0, r7 ; a2 = a1_3 + d1_3 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r3, r5 ; b2 = b1_3 + c1_3 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #6] ; op[3] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r3, r5 ; c2 = b1_3 - c1_3 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #14] ; op[7] + + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r0, r7 ; d2 = a1_3 - d1_3 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #22] ; op[11] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + smuad r3, r4, lr ; A1 = b1<<2 + c1<<2 + smuad r5, r8, lr ; C1 = b1<<2 + c1<<2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #30] ; op[15] + + ; op[1,5,9,13] + add r0, r3, r5 ; a1_1 = A1 + C1 + sub r3, r3, r5 ; b1_1 = A1 - C1 + + smuad r7, r6, lr ; B1 = b1<<2 + c1<<2 + smuad r9, r10, lr ; D1 = b1<<2 + c1<<2 + add r5, r7, r9 ; d1_1 = B1 + D1 + sub r7, r7, r9 ; c1_1 = B1 - D1 + + adds r2, r0, r5 ; a2 = a1_1 + d1_1 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r3, r7 ; b2 = b1_1 + c1_1 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #2] ; op[1] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r3, r7 ; c2 = b1_1 - c1_1 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #10] ; op[5] + + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r0, r5 ; d2 = a1_1 - d1_1 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #18] ; op[9] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + smusd r4, r4, lr ; A2 = b1<<2 - c1<<2 + smusd r8, r8, lr ; C2 = b1<<2 - c1<<2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #26] ; op[13] + + + ; op[2,6,10,14] + add r11, r4, r8 ; a1_2 = A2 + C2 + sub r12, r4, r8 ; b1_2 = A2 - C2 + + smusd r6, r6, lr ; B2 = b1<<2 - c1<<2 + smusd r10, r10, lr ; D2 = b1<<2 - c1<<2 + add r4, r6, r10 ; d1_2 = B2 + D2 + sub r8, r6, r10 ; c1_2 = B2 - D2 + + adds r2, r11, r4 ; a2 = a1_2 + d1_2 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r12, r8 ; b2 = b1_2 + c1_2 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #4] ; op[2] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r12, r8 ; c2 = b1_2 - c1_2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #12] ; op[6] + + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r11, r4 ; d2 = a1_2 - d1_2 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #20] ; op[10] + + addmi r9, r9, #1 ; += a2 < 0 + add r9, r9, #3 ; += 3 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #28] ; op[14] + + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_short_walsh4x4_armv6| + +c00040004 + DCD 0x00040004 + + END diff --git a/media/libvpx/vp8/encoder/arm/dct_arm.c b/media/libvpx/vp8/encoder/arm/dct_arm.c new file mode 100644 index 000000000..f71300d2c --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/dct_arm.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +#if HAVE_MEDIA + +void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch) +{ + vp8_short_fdct4x4_armv6(input, output, pitch); + vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch); +} + +#endif /* HAVE_MEDIA */ diff --git a/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c b/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c new file mode 100644 index 000000000..08be76e43 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/encoder/denoising.h" +#include "vpx_mem/vpx_mem.h" +#include "./vp8_rtcd.h" + +/* + * The filter function was modified to reduce the computational complexity. + * + * Step 1: + * Instead of applying tap coefficients for each pixel, we calculated the + * pixel adjustments vs. pixel diff value ahead of time. + * adjustment = filtered_value - current_raw + * = (filter_coefficient * diff + 128) >> 8 + * where + * filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3)); + * filter_coefficient += filter_coefficient / + * (3 + motion_magnitude_adjustment); + * filter_coefficient is clamped to 0 ~ 255. + * + * Step 2: + * The adjustment vs. diff curve becomes flat very quick when diff increases. + * This allowed us to use only several levels to approximate the curve without + * changing the filtering algorithm too much. + * The adjustments were further corrected by checking the motion magnitude. + * The levels used are: + * diff level adjustment w/o adjustment w/ + * motion correction motion correction + * [-255, -16] 3 -6 -7 + * [-15, -8] 2 -4 -5 + * [-7, -4] 1 -3 -4 + * [-3, 3] 0 diff diff + * [4, 7] 1 3 4 + * [8, 15] 2 4 5 + * [16, 255] 3 6 7 + */ + +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, + int mc_running_avg_y_stride, + unsigned char *running_avg_y, + int running_avg_y_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level, level1 adjustment is + * increased, the deltas stay the same. + */ + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + int64x2_t v_sum_diff_total = vdupq_n_s64(0); + + /* Go over lines. */ + int r; + for (r = 0; r < 16; ++r) { + /* Load inputs. */ + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, + v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, + v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, + v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask, + v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask, + v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment, + v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = vaddq_u8( + v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask, + v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + + /* Too much adjustments => copy block. */ + { + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff_thresh = SUM_DIFF_THRESHOLD; + + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // checK if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the accceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_running_avg_y_stride * 16; + running_avg_y -= running_avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, + v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = + vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = + vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, + fedcba98_76543210); + } + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + { + // Update the sum of all pixel differences of this MB. + x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { + return COPY_BLOCK; + } + } + } + + /* Tell above level that block was filtered. */ + running_avg_y -= running_avg_y_stride * 16; + sig -= sig_stride * 16; + + vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride); + + return FILTER_BLOCK; +} + +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, + int mc_running_avg_stride, + unsigned char *running_avg, + int running_avg_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level, level1 adjustment is + * increased, the deltas stay the same. + */ + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 4 + shift_inc : 3); + + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + int64x2_t v_sum_diff_total = vdupq_n_s64(0); + int r; + + { + uint16x4_t v_sum_block = vdup_n_u16(0); + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + const uint8x8_t v_sig = vld1_u8(sig); + const uint16x4_t _76_54_32_10 = vpaddl_u8(v_sig); + v_sum_block = vqadd_u16(v_sum_block, _76_54_32_10); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block); + const uint64x1_t _76543210 = vpaddl_u32(_7654_3210); + const int sum_block = + vget_lane_s32(vreinterpret_s32_u64(_76543210), 0); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + /* Go over lines. */ + for (r = 0; r < 4; ++r) { + /* Load inputs. */ + const uint8x8_t v_sig_lo = vld1_u8(sig); + const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]); + const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi); + const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg); + const uint8x8_t v_mc_running_avg_hi = + vld1_u8(&mc_running_avg[mc_running_avg_stride]); + const uint8x16_t v_mc_running_avg = + vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi); + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, + v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, + v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, + v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask, + v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask, + v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment, + v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = vaddq_u8( + v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask, + v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + uint8x16_t v_running_avg = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg = vqsubq_u8(v_running_avg, v_neg_adjustment); + + /* Store results. */ + vst1_u8(running_avg, vget_low_u8(v_running_avg)); + vst1_u8(&running_avg[running_avg_stride], vget_high_u8(v_running_avg)); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_running_avg_stride * 2; + running_avg += running_avg_stride * 2; + } + + + /* Too much adjustments => copy block. */ + { + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // checK if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the accceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 8; + mc_running_avg -= mc_running_avg_stride * 8; + running_avg -= running_avg_stride * 8; + for (r = 0; r < 4; ++r) { + const uint8x8_t v_sig_lo = vld1_u8(sig); + const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]); + const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi); + const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg); + const uint8x8_t v_mc_running_avg_hi = + vld1_u8(&mc_running_avg[mc_running_avg_stride]); + const uint8x16_t v_mc_running_avg = + vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi); + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, + v_mc_running_avg); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, + v_mc_running_avg); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, + v_mc_running_avg); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = + vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + const uint8x8_t v_running_avg_lo = vld1_u8(running_avg); + const uint8x8_t v_running_avg_hi = + vld1_u8(&running_avg[running_avg_stride]); + uint8x16_t v_running_avg = + vcombine_u8(v_running_avg_lo, v_running_avg_hi); + + v_running_avg = vqsubq_u8(v_running_avg, v_pos_adjustment); + v_running_avg = vqaddq_u8(v_running_avg, v_neg_adjustment); + + /* Store results. */ + vst1_u8(running_avg, vget_low_u8(v_running_avg)); + vst1_u8(&running_avg[running_avg_stride], + vget_high_u8(v_running_avg)); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = + vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, + fedcba98_76543210); + } + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_running_avg_stride * 2; + running_avg += running_avg_stride * 2; + } + { + // Update the sum of all pixel differences of this MB. + x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { + return COPY_BLOCK; + } + } + } + + /* Tell above level that block was filtered. */ + running_avg -= running_avg_stride * 8; + sig -= sig_stride * 8; + + vp8_copy_mem8x8(running_avg, running_avg_stride, sig, sig_stride); + + return FILTER_BLOCK; +} diff --git a/media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c new file mode 100644 index 000000000..e5824bfb2 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vp8/encoder/block.h" + +static const uint16_t inv_zig_zag[16] = { + 1, 2, 6, 7, + 3, 5, 8, 13, + 4, 9, 12, 14, + 10, 11, 15, 16 +}; + +void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { + const int16x8_t one_q = vdupq_n_s16(-1), + z0 = vld1q_s16(b->coeff), + z1 = vld1q_s16(b->coeff + 8), + round0 = vld1q_s16(b->round), + round1 = vld1q_s16(b->round + 8), + quant0 = vld1q_s16(b->quant_fast), + quant1 = vld1q_s16(b->quant_fast + 8), + dequant0 = vld1q_s16(d->dequant), + dequant1 = vld1q_s16(d->dequant + 8); + const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag), + zig_zag1 = vld1q_u16(inv_zig_zag + 8); + int16x8_t x0, x1, sz0, sz1, y0, y1; + uint16x8_t eob0, eob1; + uint16x4_t eob_d16; + uint32x2_t eob_d32; + uint32x4_t eob_q32; + + /* sign of z: z >> 15 */ + sz0 = vshrq_n_s16(z0, 15); + sz1 = vshrq_n_s16(z1, 15); + + /* x = abs(z) */ + x0 = vabsq_s16(z0); + x1 = vabsq_s16(z1); + + /* x += round */ + x0 = vaddq_s16(x0, round0); + x1 = vaddq_s16(x1, round1); + + /* y = 2 * (x * quant) >> 16 */ + y0 = vqdmulhq_s16(x0, quant0); + y1 = vqdmulhq_s16(x1, quant1); + + /* Compensate for doubling in vqdmulhq */ + y0 = vshrq_n_s16(y0, 1); + y1 = vshrq_n_s16(y1, 1); + + /* Restore sign bit */ + y0 = veorq_s16(y0, sz0); + y1 = veorq_s16(y1, sz1); + x0 = vsubq_s16(y0, sz0); + x1 = vsubq_s16(y1, sz1); + + /* find non-zero elements */ + eob0 = vtstq_s16(x0, one_q); + eob1 = vtstq_s16(x1, one_q); + + /* mask zig zag */ + eob0 = vandq_u16(eob0, zig_zag0); + eob1 = vandq_u16(eob1, zig_zag1); + + /* select the largest value */ + eob0 = vmaxq_u16(eob0, eob1); + eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); + eob_q32 = vmovl_u16(eob_d16); + eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32)); + eob_d32 = vpmax_u32(eob_d32, eob_d32); + + /* qcoeff = x */ + vst1q_s16(d->qcoeff, x0); + vst1q_s16(d->qcoeff + 8, x1); + + /* dqcoeff = x * dequant */ + vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0)); + vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1)); + + vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); +} diff --git a/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c new file mode 100644 index 000000000..391e5f990 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_short_fdct4x4_neon( + int16_t *input, + int16_t *output, + int pitch) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d16s16, d17s16, d26s16, dEmptys16; + uint16x4_t d4u16; + int16x8_t q0s16, q1s16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + + d16s16 = vdup_n_s16(5352); + d17s16 = vdup_n_s16(2217); + q9s32 = vdupq_n_s32(14500); + q10s32 = vdupq_n_s32(7500); + q11s32 = vdupq_n_s32(12000); + q12s32 = vdupq_n_s32(51000); + + // Part one + pitch >>= 1; + d0s16 = vld1_s16(input); + input += pitch; + d1s16 = vld1_s16(input); + input += pitch; + d2s16 = vld1_s16(input); + input += pitch; + d3s16 = vld1_s16(input); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), + vreinterpret_s32_s16(d2s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), + vreinterpret_s32_s16(d3s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 + vreinterpret_s16_s32(v2tmp3.val[0])); // d1 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 + vreinterpret_s16_s32(v2tmp3.val[1])); // d3 + + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + + d4s16 = vshl_n_s16(d4s16, 3); + d5s16 = vshl_n_s16(d5s16, 3); + d6s16 = vshl_n_s16(d6s16, 3); + d7s16 = vshl_n_s16(d7s16, 3); + + d0s16 = vadd_s16(d4s16, d5s16); + d2s16 = vsub_s16(d4s16, d5s16); + + q9s32 = vmlal_s16(q9s32, d7s16, d16s16); + q10s32 = vmlal_s16(q10s32, d7s16, d17s16); + q9s32 = vmlal_s16(q9s32, d6s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d6s16, d16s16); + + d1s16 = vshrn_n_s32(q9s32, 12); + d3s16 = vshrn_n_s32(q10s32, 12); + + // Part two + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), + vreinterpret_s32_s16(d2s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), + vreinterpret_s32_s16(d3s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 + vreinterpret_s16_s32(v2tmp3.val[0])); // d1 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 + vreinterpret_s16_s32(v2tmp3.val[1])); // d3 + + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + + d26s16 = vdup_n_s16(7); + d4s16 = vadd_s16(d4s16, d26s16); + + d0s16 = vadd_s16(d4s16, d5s16); + d2s16 = vsub_s16(d4s16, d5s16); + + q11s32 = vmlal_s16(q11s32, d7s16, d16s16); + q12s32 = vmlal_s16(q12s32, d7s16, d17s16); + + dEmptys16 = vdup_n_s16(0); + d4u16 = vceq_s16(d7s16, dEmptys16); + + d0s16 = vshr_n_s16(d0s16, 4); + d2s16 = vshr_n_s16(d2s16, 4); + + q11s32 = vmlal_s16(q11s32, d6s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d6s16, d16s16); + + d4u16 = vmvn_u16(d4u16); + d1s16 = vshrn_n_s32(q11s32, 16); + d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16)); + d3s16 = vshrn_n_s32(q12s32, 16); + + q0s16 = vcombine_s16(d0s16, d1s16); + q1s16 = vcombine_s16(d2s16, d3s16); + + vst1q_s16(output, q0s16); + vst1q_s16(output + 8, q1s16); + return; +} + +void vp8_short_fdct8x4_neon( + int16_t *input, + int16_t *output, + int pitch) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16; + uint16x4_t d28u16, d29u16; + uint16x8_t q14u16; + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + int16x8x2_t v2tmp0, v2tmp1; + int32x4x2_t v2tmp2, v2tmp3; + + d16s16 = vdup_n_s16(5352); + d17s16 = vdup_n_s16(2217); + q9s32 = vdupq_n_s32(14500); + q10s32 = vdupq_n_s32(7500); + + // Part one + pitch >>= 1; + q0s16 = vld1q_s16(input); + input += pitch; + q1s16 = vld1q_s16(input); + input += pitch; + q2s16 = vld1q_s16(input); + input += pitch; + q3s16 = vld1q_s16(input); + + v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), + vreinterpretq_s32_s16(q2s16)); + v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), + vreinterpretq_s32_s16(q3s16)); + v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 + vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 + v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 + vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 + + q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); + q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); + + q11s16 = vshlq_n_s16(q11s16, 3); + q12s16 = vshlq_n_s16(q12s16, 3); + q13s16 = vshlq_n_s16(q13s16, 3); + q14s16 = vshlq_n_s16(q14s16, 3); + + q0s16 = vaddq_s16(q11s16, q12s16); + q2s16 = vsubq_s16(q11s16, q12s16); + + q11s32 = q9s32; + q12s32 = q10s32; + + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + q9s32 = vmlal_s16(q9s32, d28s16, d16s16); + q10s32 = vmlal_s16(q10s32, d28s16, d17s16); + q11s32 = vmlal_s16(q11s32, d29s16, d16s16); + q12s32 = vmlal_s16(q12s32, d29s16, d17s16); + + q9s32 = vmlal_s16(q9s32, d26s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); + q11s32 = vmlal_s16(q11s32, d27s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); + + d2s16 = vshrn_n_s32(q9s32, 12); + d6s16 = vshrn_n_s32(q10s32, 12); + d3s16 = vshrn_n_s32(q11s32, 12); + d7s16 = vshrn_n_s32(q12s32, 12); + q1s16 = vcombine_s16(d2s16, d3s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + // Part two + q9s32 = vdupq_n_s32(12000); + q10s32 = vdupq_n_s32(51000); + + v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), + vreinterpretq_s32_s16(q2s16)); + v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), + vreinterpretq_s32_s16(q3s16)); + v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 + vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 + v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 + vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 + + q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); + q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); + + q15s16 = vdupq_n_s16(7); + q11s16 = vaddq_s16(q11s16, q15s16); + q0s16 = vaddq_s16(q11s16, q12s16); + q1s16 = vsubq_s16(q11s16, q12s16); + + q11s32 = q9s32; + q12s32 = q10s32; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d2s16 = vget_low_s16(q1s16); + d3s16 = vget_high_s16(q1s16); + + d0s16 = vshr_n_s16(d0s16, 4); + d4s16 = vshr_n_s16(d1s16, 4); + d2s16 = vshr_n_s16(d2s16, 4); + d6s16 = vshr_n_s16(d3s16, 4); + + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + q9s32 = vmlal_s16(q9s32, d28s16, d16s16); + q10s32 = vmlal_s16(q10s32, d28s16, d17s16); + q11s32 = vmlal_s16(q11s32, d29s16, d16s16); + q12s32 = vmlal_s16(q12s32, d29s16, d17s16); + + q9s32 = vmlal_s16(q9s32, d26s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); + q11s32 = vmlal_s16(q11s32, d27s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); + + d1s16 = vshrn_n_s32(q9s32, 16); + d3s16 = vshrn_n_s32(q10s32, 16); + d5s16 = vshrn_n_s32(q11s32, 16); + d7s16 = vshrn_n_s32(q12s32, 16); + + qEmptys16 = vdupq_n_s16(0); + q14u16 = vceqq_s16(q14s16, qEmptys16); + q14u16 = vmvnq_u16(q14u16); + + d28u16 = vget_low_u16(q14u16); + d29u16 = vget_high_u16(q14u16); + d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16)); + d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16)); + + q0s16 = vcombine_s16(d0s16, d1s16); + q1s16 = vcombine_s16(d2s16, d3s16); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + vst1q_s16(output, q0s16); + vst1q_s16(output + 8, q1s16); + vst1q_s16(output + 16, q2s16); + vst1q_s16(output + 24, q3s16); + return; +} diff --git a/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c new file mode 100644 index 000000000..d3ab7b165 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vp8/encoder/block.h" + +void vp8_subtract_b_neon( + BLOCK *be, + BLOCKD *bd, + int pitch) { + unsigned char *src_ptr, *predictor; + int src_stride; + int16_t *src_diff; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint16x8_t q10u16, q11u16, q12u16, q13u16; + + src_ptr = *be->base_src + be->src; + src_stride = be->src_stride; + predictor = bd->predictor; + + d0u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d4u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d6u8 = vld1_u8(src_ptr); + + d1u8 = vld1_u8(predictor); + predictor += pitch; + d3u8 = vld1_u8(predictor); + predictor += pitch; + d5u8 = vld1_u8(predictor); + predictor += pitch; + d7u8 = vld1_u8(predictor); + + q10u16 = vsubl_u8(d0u8, d1u8); + q11u16 = vsubl_u8(d2u8, d3u8); + q12u16 = vsubl_u8(d4u8, d5u8); + q13u16 = vsubl_u8(d6u8, d7u8); + + src_diff = be->src_diff; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16)); + src_diff += pitch; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16)); + src_diff += pitch; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16)); + src_diff += pitch; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16)); + return; +} + +void vp8_subtract_mby_neon( + int16_t *diff, + unsigned char *src, + int src_stride, + unsigned char *pred, + int pred_stride) { + int i; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + for (i = 0; i < 8; i++) { // subtract_mby_loop + q0u8 = vld1q_u8(src); + src += src_stride; + q2u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(pred); + pred += pred_stride; + q3u8 = vld1q_u8(pred); + pred += pred_stride; + + q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8)); + q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8)); + + vst1q_u16((uint16_t *)diff, q8u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q9u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q10u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q11u16); + diff += 8; + } + return; +} + +void vp8_subtract_mbuv_neon( + int16_t *diff, + unsigned char *usrc, + unsigned char *vsrc, + int src_stride, + unsigned char *upred, + unsigned char *vpred, + int pred_stride) { + int i, j; + unsigned char *src_ptr, *pred_ptr; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + diff += 256; + for (i = 0; i < 2; i++) { + if (i == 0) { + src_ptr = usrc; + pred_ptr = upred; + } else if (i == 1) { + src_ptr = vsrc; + pred_ptr = vpred; + } + + for (j = 0; j < 2; j++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d1u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d3u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + d4u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d5u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + d6u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d7u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + + q8u16 = vsubl_u8(d0u8, d1u8); + q9u16 = vsubl_u8(d2u8, d3u8); + q10u16 = vsubl_u8(d4u8, d5u8); + q11u16 = vsubl_u8(d6u8, d7u8); + + vst1q_u16((uint16_t *)diff, q8u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q9u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q10u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q11u16); + diff += 8; + } + } + return; +} diff --git a/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c new file mode 100644 index 000000000..5ad946500 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vpx_ports/arm.h" + +#ifdef VPX_INCOMPATIBLE_GCC +#include "./vp8_rtcd.h" +void vp8_short_walsh4x4_neon( + int16_t *input, + int16_t *output, + int pitch) { + vp8_short_walsh4x4_c(input, output, pitch); +} +#else +void vp8_short_walsh4x4_neon( + int16_t *input, + int16_t *output, + int pitch) { + uint16x4_t d16u16; + int16x8_t q0s16, q1s16; + int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32; + int32x4_t q9s32, q10s32, q11s32, q15s32; + uint32x4_t q8u32, q9u32, q10u32, q11u32; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + + dEmptys16 = vdup_n_s16(0); + qEmptys32 = vdupq_n_s32(0); + q15s32 = vdupq_n_s32(3); + + d0s16 = vld1_s16(input); + input += pitch/2; + d1s16 = vld1_s16(input); + input += pitch/2; + d2s16 = vld1_s16(input); + input += pitch/2; + d3s16 = vld1_s16(input); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), + vreinterpret_s32_s16(d2s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), + vreinterpret_s32_s16(d3s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 + vreinterpret_s16_s32(v2tmp3.val[0])); // d1 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 + vreinterpret_s16_s32(v2tmp3.val[1])); // d3 + + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]); + d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]); + d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]); + + d4s16 = vshl_n_s16(d4s16, 2); + d5s16 = vshl_n_s16(d5s16, 2); + d6s16 = vshl_n_s16(d6s16, 2); + d7s16 = vshl_n_s16(d7s16, 2); + + d16u16 = vceq_s16(d4s16, dEmptys16); + d16u16 = vmvn_u16(d16u16); + + d0s16 = vadd_s16(d4s16, d5s16); + d3s16 = vsub_s16(d4s16, d5s16); + d1s16 = vadd_s16(d7s16, d6s16); + d2s16 = vsub_s16(d7s16, d6s16); + + d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16)); + + // Second for-loop + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16), + vreinterpret_s32_s16(d3s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16), + vreinterpret_s32_s16(d2s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]), // d2 + vreinterpret_s16_s32(v2tmp2.val[1])); // d3 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]), // d0 + vreinterpret_s16_s32(v2tmp2.val[0])); // d1 + + q8s32 = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]); + q9s32 = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]); + q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]); + q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]); + + q0s32 = vaddq_s32(q8s32, q9s32); + q1s32 = vaddq_s32(q11s32, q10s32); + q2s32 = vsubq_s32(q11s32, q10s32); + q3s32 = vsubq_s32(q8s32, q9s32); + + q8u32 = vcltq_s32(q0s32, qEmptys32); + q9u32 = vcltq_s32(q1s32, qEmptys32); + q10u32 = vcltq_s32(q2s32, qEmptys32); + q11u32 = vcltq_s32(q3s32, qEmptys32); + + q8s32 = vreinterpretq_s32_u32(q8u32); + q9s32 = vreinterpretq_s32_u32(q9u32); + q10s32 = vreinterpretq_s32_u32(q10u32); + q11s32 = vreinterpretq_s32_u32(q11u32); + + q0s32 = vsubq_s32(q0s32, q8s32); + q1s32 = vsubq_s32(q1s32, q9s32); + q2s32 = vsubq_s32(q2s32, q10s32); + q3s32 = vsubq_s32(q3s32, q11s32); + + q8s32 = vaddq_s32(q0s32, q15s32); + q9s32 = vaddq_s32(q1s32, q15s32); + q10s32 = vaddq_s32(q2s32, q15s32); + q11s32 = vaddq_s32(q3s32, q15s32); + + d0s16 = vshrn_n_s32(q8s32, 3); + d1s16 = vshrn_n_s32(q9s32, 3); + d2s16 = vshrn_n_s32(q10s32, 3); + d3s16 = vshrn_n_s32(q11s32, 3); + + q0s16 = vcombine_s16(d0s16, d1s16); + q1s16 = vcombine_s16(d2s16, d3s16); + + vst1q_s16(output, q0s16); + vst1q_s16(output + 8, q1s16); + return; +} +#endif // VPX_INCOMPATIBLE_GCC |