diff options
Diffstat (limited to 'media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm')
-rw-r--r-- | media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm b/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm new file mode 100644 index 000000000..9704b4210 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_bil_first_pass_armv6| + EXPORT |vp8_filter_block2d_bil_second_pass_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *dst_ptr, +; r2 unsigned int src_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp8_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. +|vp8_filter_block2d_bil_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; width + + mov r12, r3 ; outer-loop counter + + add r7, r2, r4 ; preload next row + pld [r0, r7] + + sub r2, r2, r4 ; src increment for height loop + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save dst_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + add r9, r2, r4, lsl #1 ; adding back block width + pld [r0, r9] ; preload next row + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *dst_ptr, +; r2 int dst_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_bil_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_armv6| + + END |