diff options
Diffstat (limited to 'media/libvpx/vp8/common')
132 files changed, 37802 insertions, 0 deletions
diff --git a/media/libvpx/vp8/common/alloccommon.c b/media/libvpx/vp8/common/alloccommon.c new file mode 100644 index 000000000..8dfd4ce20 --- /dev/null +++ b/media/libvpx/vp8/common/alloccommon.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "alloccommon.h" +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "onyxc_int.h" +#include "findnearmv.h" +#include "entropymode.h" +#include "systemdependent.h" + +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci) +{ + int i; + for (i = 0; i < NUM_YV12_BUFFERS; i++) + vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]); + + vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame); +#if CONFIG_POSTPROC + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer); + if (oci->post_proc_buffer_int_used) + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int); + + vpx_free(oci->pp_limits_buffer); + oci->pp_limits_buffer = NULL; +#endif + + vpx_free(oci->above_context); + vpx_free(oci->mip); +#if CONFIG_ERROR_CONCEALMENT + vpx_free(oci->prev_mip); + oci->prev_mip = NULL; +#endif + + oci->above_context = NULL; + oci->mip = NULL; +} + +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height) +{ + int i; + + vp8_de_alloc_frame_buffers(oci); + + /* our internal buffers are always multiples of 16 */ + if ((width & 0xf) != 0) + width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) + height += 16 - (height & 0xf); + + + for (i = 0; i < NUM_YV12_BUFFERS; i++) + { + oci->fb_idx_ref_cnt[i] = 0; + oci->yv12_fb[i].flags = 0; + if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + } + + oci->new_fb_idx = 0; + oci->lst_fb_idx = 1; + oci->gld_fb_idx = 2; + oci->alt_fb_idx = 3; + + oci->fb_idx_ref_cnt[0] = 1; + oci->fb_idx_ref_cnt[1] = 1; + oci->fb_idx_ref_cnt[2] = 1; + oci->fb_idx_ref_cnt[3] = 1; + + if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + + oci->mb_rows = height >> 4; + oci->mb_cols = width >> 4; + oci->MBs = oci->mb_rows * oci->mb_cols; + oci->mode_info_stride = oci->mb_cols + 1; + oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); + + if (!oci->mip) + goto allocation_fail; + + oci->mi = oci->mip + oci->mode_info_stride + 1; + + /* Allocation of previous mode info will be done in vp8_decode_frame() + * as it is a decoder only data */ + + oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); + + if (!oci->above_context) + goto allocation_fail; + +#if CONFIG_POSTPROC + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + + oci->post_proc_buffer_int_used = 0; + memset(&oci->postproc_state, 0, sizeof(oci->postproc_state)); + memset(oci->post_proc_buffer.buffer_alloc, 128, + oci->post_proc_buffer.frame_size); + + /* Allocate buffer to store post-processing filter coefficients. + * + * Note: Round up mb_cols to support SIMD reads + */ + oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1)); + if (!oci->pp_limits_buffer) + goto allocation_fail; +#endif + + return 0; + +allocation_fail: + vp8_de_alloc_frame_buffers(oci); + return 1; +} + +void vp8_setup_version(VP8_COMMON *cm) +{ + switch (cm->version) + { + case 0: + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + case 1: + cm->no_lpf = 0; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 2: + cm->no_lpf = 1; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 3: + cm->no_lpf = 1; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 1; + break; + default: + /*4,5,6,7 are reserved for future use*/ + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + } +} +void vp8_create_common(VP8_COMMON *oci) +{ + vp8_machine_specific_config(oci); + + vp8_init_mbmode_probs(oci); + vp8_default_bmode_probs(oci->fc.bmode_prob); + + oci->mb_no_coeff_skip = 1; + oci->no_lpf = 0; + oci->filter_type = NORMAL_LOOPFILTER; + oci->use_bilinear_mc_filter = 0; + oci->full_pixel = 0; + oci->multi_token_partition = ONE_PARTITION; + oci->clamp_type = RECON_CLAMP_REQUIRED; + + /* Initialize reference frame sign bias structure to defaults */ + memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + + /* Default disable buffer to buffer copying */ + oci->copy_buffer_to_gf = 0; + oci->copy_buffer_to_arf = 0; +} + +void vp8_remove_common(VP8_COMMON *oci) +{ + vp8_de_alloc_frame_buffers(oci); +} diff --git a/media/libvpx/vp8/common/alloccommon.h b/media/libvpx/vp8/common/alloccommon.h new file mode 100644 index 000000000..93e99d76b --- /dev/null +++ b/media/libvpx/vp8/common/alloccommon.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ALLOCCOMMON_H_ +#define VP8_COMMON_ALLOCCOMMON_H_ + +#include "onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_create_common(VP8_COMMON *oci); +void vp8_remove_common(VP8_COMMON *oci); +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); +void vp8_setup_version(VP8_COMMON *oci); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ALLOCCOMMON_H_ diff --git a/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm b/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm new file mode 100644 index 000000000..9704b4210 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_bil_first_pass_armv6| + EXPORT |vp8_filter_block2d_bil_second_pass_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *dst_ptr, +; r2 unsigned int src_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp8_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. +|vp8_filter_block2d_bil_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; width + + mov r12, r3 ; outer-loop counter + + add r7, r2, r4 ; preload next row + pld [r0, r7] + + sub r2, r2, r4 ; src increment for height loop + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save dst_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + add r9, r2, r4, lsl #1 ; adding back block width + pld [r0, r9] ; preload next row + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *dst_ptr, +; r2 int dst_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_bil_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r4, [sp, #36] ; width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_armv6| + + END diff --git a/media/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm b/media/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm new file mode 100644 index 000000000..abf048c2f --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm @@ -0,0 +1,186 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_copy_mem16x16_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem16x16_v6| PROC + stmdb sp!, {r4 - r7} + ;push {r4-r7} + + ;preload + pld [r0, #31] ; preload for next 16x16 block + + ands r4, r0, #15 + beq copy_mem16x16_fast + + ands r4, r0, #7 + beq copy_mem16x16_8 + + ands r4, r0, #3 + beq copy_mem16x16_4 + + ;copy one byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + ldrb r6, [r0, #2] + ldrb r7, [r0, #3] + + mov r12, #16 + +copy_mem16x16_1_loop + strb r4, [r2] + strb r5, [r2, #1] + strb r6, [r2, #2] + strb r7, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + ldrb r6, [r0, #6] + ldrb r7, [r0, #7] + + subs r12, r12, #1 + + strb r4, [r2, #4] + strb r5, [r2, #5] + strb r6, [r2, #6] + strb r7, [r2, #7] + + ldrb r4, [r0, #8] + ldrb r5, [r0, #9] + ldrb r6, [r0, #10] + ldrb r7, [r0, #11] + + strb r4, [r2, #8] + strb r5, [r2, #9] + strb r6, [r2, #10] + strb r7, [r2, #11] + + ldrb r4, [r0, #12] + ldrb r5, [r0, #13] + ldrb r6, [r0, #14] + ldrb r7, [r0, #15] + + add r0, r0, r1 + + strb r4, [r2, #12] + strb r5, [r2, #13] + strb r6, [r2, #14] + strb r7, [r2, #15] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + ldrneb r6, [r0, #2] + ldrneb r7, [r0, #3] + + pld [r0, #31] ; preload for next 16x16 block + + bne copy_mem16x16_1_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 4 bytes each time +copy_mem16x16_4 + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ldr r7, [r0, #12] + + mov r12, #16 + +copy_mem16x16_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + str r6, [r2, #8] + str r7, [r2, #12] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + ldrne r6, [r0, #8] + ldrne r7, [r0, #12] + + pld [r0, #31] ; preload for next 16x16 block + + bne copy_mem16x16_4_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 8 bytes each time +copy_mem16x16_8 + sub r1, r1, #16 + sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_8_loop + ldmia r0!, {r4-r5} + ;ldm r0, {r4-r5} + ldmia r0!, {r6-r7} + + add r0, r0, r1 + + stmia r2!, {r4-r5} + subs r12, r12, #1 + ;stm r2, {r4-r5} + stmia r2!, {r6-r7} + + add r2, r2, r3 + + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_8_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 16 bytes each time +copy_mem16x16_fast + ;sub r1, r1, #16 + ;sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_fast_loop + ldmia r0, {r4-r7} + ;ldm r0, {r4-r7} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r7} + ;stm r2, {r4-r7} + add r2, r2, r3 + + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_fast_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + + ENDP ; |vp8_copy_mem16x16_v6| + + END diff --git a/media/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm b/media/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm new file mode 100644 index 000000000..d8362ef05 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_copy_mem8x4_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x4_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x4_fast + + ands r4, r0, #3 + beq copy_mem8x4_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #4 + +copy_mem8x4_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x4_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x4_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #4 + +copy_mem8x4_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x4_4_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x4_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #4 + +copy_mem8x4_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x4_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x4_v6| + + END diff --git a/media/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm b/media/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm new file mode 100644 index 000000000..c6a60c610 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_copy_mem8x8_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_copy_mem8x8_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x8_fast + + ands r4, r0, #3 + beq copy_mem8x8_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #8 + +copy_mem8x8_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x8_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x8_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #8 + +copy_mem8x8_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x8_4_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x8_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #8 + +copy_mem8x8_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x8_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp8_copy_mem8x8_v6| + + END diff --git a/media/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/media/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm new file mode 100644 index 000000000..9aa659fa7 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm @@ -0,0 +1,70 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_dc_only_idct_add_v6| + + AREA |.text|, CODE, READONLY + +;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, +; int pred_stride, unsigned char *dst_ptr, +; int dst_stride) +; r0 input_dc +; r1 pred_ptr +; r2 pred_stride +; r3 dst_ptr +; sp dst_stride + +|vp8_dc_only_idct_add_v6| PROC + stmdb sp!, {r4 - r7} + + add r0, r0, #4 ; input_dc += 4 + ldr r12, c0x0000FFFF + ldr r4, [r1], r2 + and r0, r12, r0, asr #3 ; input_dc >> 3 + mask + ldr r6, [r1], r2 + orr r0, r0, r0, lsl #16 ; a1 | a1 + + ldr r12, [sp, #16] ; dst stride + + uxtab16 r5, r0, r4 ; a1+2 | a1+0 + uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + ldr r4, [r1], r2 + str r5, [r3], r12 + ldr r6, [r1] + str r7, [r3], r12 + + uxtab16 r5, r0, r4 + uxtab16 r4, r0, r4, ror #8 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + str r5, [r3], r12 + str r7, [r3] + + ldmia sp!, {r4 - r7} + bx lr + + ENDP ; |vp8_dc_only_idct_add_v6| + +; Constant Pool +c0x0000FFFF DCD 0x0000FFFF + END diff --git a/media/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm b/media/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm new file mode 100644 index 000000000..db48ded58 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm @@ -0,0 +1,190 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_dequant_idct_add_v6| + + AREA |.text|, CODE, READONLY +;void vp8_dequant_idct_v6(short *input, short *dq, +; unsigned char *dest, int stride) +; r0 = q +; r1 = dq +; r2 = dst +; r3 = stride + +|vp8_dequant_idct_add_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r3, [sp] + + mov r12, #4 + +vp8_dequant_add_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne vp8_dequant_add_loop + + sub r0, r0, #32 + mov r1, r0 + +; short_idct4x4llm_v6_dual + ldr r3, cospi8sqrt2minus1 + ldr r4, sinpi8sqrt2 + ldr r6, [r0, #8] + mov r5, #2 +vp8_dequant_idct_loop1_v6 + ldr r12, [r0, #24] + ldr r14, [r0, #16] + smulwt r9, r3, r6 + smulwb r7, r3, r6 + smulwt r10, r4, r6 + smulwb r8, r4, r6 + pkhbt r7, r7, r9, lsl #16 + smulwt r11, r3, r12 + pkhbt r8, r8, r10, lsl #16 + uadd16 r6, r6, r7 + smulwt r7, r4, r12 + smulwb r9, r3, r12 + smulwb r10, r4, r12 + subs r5, r5, #1 + pkhbt r9, r9, r11, lsl #16 + ldr r11, [r0], #4 + pkhbt r10, r10, r7, lsl #16 + uadd16 r7, r12, r9 + usub16 r7, r8, r7 + uadd16 r6, r6, r10 + uadd16 r10, r11, r14 + usub16 r8, r11, r14 + uadd16 r9, r10, r6 + usub16 r10, r10, r6 + uadd16 r6, r8, r7 + usub16 r7, r8, r7 + str r6, [r1, #8] + ldrne r6, [r0, #8] + str r7, [r1, #16] + str r10, [r1, #24] + str r9, [r1], #4 + bne vp8_dequant_idct_loop1_v6 + + mov r5, #2 + sub r0, r1, #8 +vp8_dequant_idct_loop2_v6 + ldr r6, [r0], #4 + ldr r7, [r0], #4 + ldr r8, [r0], #4 + ldr r9, [r0], #4 + smulwt r1, r3, r6 + smulwt r12, r4, r6 + smulwt lr, r3, r8 + smulwt r10, r4, r8 + pkhbt r11, r8, r6, lsl #16 + pkhbt r1, lr, r1, lsl #16 + pkhbt r12, r10, r12, lsl #16 + pkhtb r6, r6, r8, asr #16 + uadd16 r6, r1, r6 + pkhbt lr, r9, r7, lsl #16 + uadd16 r10, r11, lr + usub16 lr, r11, lr + pkhtb r8, r7, r9, asr #16 + subs r5, r5, #1 + smulwt r1, r3, r8 + smulwb r7, r3, r8 + smulwt r11, r4, r8 + smulwb r9, r4, r8 + pkhbt r1, r7, r1, lsl #16 + uadd16 r8, r1, r8 + pkhbt r11, r9, r11, lsl #16 + usub16 r1, r12, r8 + uadd16 r8, r11, r6 + ldr r9, c0x00040004 + ldr r12, [sp] ; get stride from stack + uadd16 r6, r10, r8 + usub16 r7, r10, r8 + uadd16 r7, r7, r9 + uadd16 r6, r6, r9 + uadd16 r10, r14, r1 + usub16 r1, r14, r1 + uadd16 r10, r10, r9 + uadd16 r1, r1, r9 + ldr r11, [r2] ; load input from dst + mov r8, r7, asr #3 + pkhtb r9, r8, r10, asr #19 + mov r8, r1, asr #3 + pkhtb r8, r8, r6, asr #19 + uxtb16 lr, r11, ror #8 + qadd16 r9, r9, lr + uxtb16 lr, r11 + qadd16 r8, r8, lr + usat16 r9, #8, r9 + usat16 r8, #8, r8 + orr r9, r8, r9, lsl #8 + ldr r11, [r2, r12] ; load input from dst + mov r7, r7, lsl #16 + mov r1, r1, lsl #16 + mov r10, r10, lsl #16 + mov r6, r6, lsl #16 + mov r7, r7, asr #3 + pkhtb r7, r7, r10, asr #19 + mov r1, r1, asr #3 + pkhtb r1, r1, r6, asr #19 + uxtb16 r8, r11, ror #8 + qadd16 r7, r7, r8 + uxtb16 r8, r11 + qadd16 r1, r1, r8 + usat16 r7, #8, r7 + usat16 r1, #8, r1 + orr r1, r1, r7, lsl #8 + str r9, [r2], r12 ; store output to dst + str r1, [r2], r12 ; store output to dst + bne vp8_dequant_idct_loop2_v6 + +; memset + sub r0, r0, #32 + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_dequant_idct_add_v6| + +; Constant Pool +cospi8sqrt2minus1 DCD 0x00004E7B +sinpi8sqrt2 DCD 0x00008A8C +c0x00040004 DCD 0x00040004 + + END diff --git a/media/libvpx/vp8/common/arm/armv6/dequantize_v6.asm b/media/libvpx/vp8/common/arm/armv6/dequantize_v6.asm new file mode 100644 index 000000000..72f7e0ee5 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/dequantize_v6.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_dequantize_b_loop_v6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------- +;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); +; r0 short *Q, +; r1 short *DQC +; r2 short *DQ +|vp8_dequantize_b_loop_v6| PROC + stmdb sp!, {r4-r9, lr} + + ldr r3, [r0] ;load Q + ldr r4, [r1] ;load DQC + ldr r5, [r0, #4] + ldr r6, [r1, #4] + + mov r12, #2 ;loop counter + +dequant_loop + smulbb r7, r3, r4 ;multiply + smultt r8, r3, r4 + smulbb r9, r5, r6 + smultt lr, r5, r6 + + ldr r3, [r0, #8] + ldr r4, [r1, #8] + ldr r5, [r0, #12] + ldr r6, [r1, #12] + + strh r7, [r2], #2 ;store result + smulbb r7, r3, r4 ;multiply + strh r8, [r2], #2 + smultt r8, r3, r4 + strh r9, [r2], #2 + smulbb r9, r5, r6 + strh lr, [r2], #2 + smultt lr, r5, r6 + + subs r12, r12, #1 + + add r0, r0, #16 + add r1, r1, #16 + + ldrne r3, [r0] + strh r7, [r2], #2 ;store result + ldrne r4, [r1] + strh r8, [r2], #2 + ldrne r5, [r0, #4] + strh r9, [r2], #2 + ldrne r6, [r1, #4] + strh lr, [r2], #2 + + bne dequant_loop + + ldmia sp!, {r4-r9, pc} + ENDP ;|vp8_dequantize_b_loop_v6| + + END diff --git a/media/libvpx/vp8/common/arm/armv6/filter_v6.asm b/media/libvpx/vp8/common/arm/armv6/filter_v6.asm new file mode 100644 index 000000000..eb4b75bd8 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/filter_v6.asm @@ -0,0 +1,624 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_filter_block2d_first_pass_armv6| + EXPORT |vp8_filter_block2d_first_pass_16x16_armv6| + EXPORT |vp8_filter_block2d_first_pass_8x8_armv6| + EXPORT |vp8_filter_block2d_second_pass_armv6| + EXPORT |vp8_filter4_block2d_second_pass_armv6| + EXPORT |vp8_filter_block2d_first_pass_only_armv6| + EXPORT |vp8_filter_block2d_second_pass_only_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr +; r1 short *output_ptr +; r2 unsigned int src_pixels_per_line +; r3 unsigned int output_width +; stack unsigned int output_height +; stack const short *vp8_filter +;------------------------------------- +; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with +; the output being a 2 byte value and the intput being a 1 byte value. +|vp8_filter_block2d_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +; -------------------------- +; 16x16 version +; ----------------------------- +|vp8_filter_block2d_first_pass_16x16_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + add r4, r2, #18 ; preload next low + pld [r0, r4] + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_16_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_16_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_16_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r11, r2, #34 ; adding back block width(=16) + pld [r0, r11] ; preload next low + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_16_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +; -------------------------- +; 8x8 version +; ----------------------------- +|vp8_filter_block2d_first_pass_8x8_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp8_filter address + ldr r7, [sp, #36] ; output height + + add r4, r2, #10 ; preload next low + pld [r0, r4] + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_8_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_8_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_8_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r11, r2, #18 ; adding back block width(=8) + pld [r0, r11] ; preload next low + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_8_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp8_filter address + sub sp, sp, #4 + mov r7, r3, lsl #16 ; height is top part of counter + str r1, [sp] ; push destination to stack + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + + sub r0, r0, #4 ; offset input buffer + +|height_loop_2nd| + ldr r8, [r0] ; load the data + ldr r9, [r0, #4] + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd| + smuad lr, r4, r8 ; apply filter + sub r7, r7, #1 + smulbt r8, r4, r8 + + ldr r10, [r0, #8] + + smlad lr, r5, r9, lr + smladx r8, r12, r9, r8 + + ldrh r9, [r0, #12] + + smlad lr, r6, r10, lr + smladx r8, r11, r10, r8 + + add r0, r0, #4 + smlatb r10, r6, r9, r8 + + add lr, lr, #0x40 ; round_shift_and_clamp + ands r8, r7, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r2 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + ldrne r8, [r0] ; load data for next loop + ldrne r9, [r0, #4] + strb r10, [r1], r2 + + bne width_loop_2nd + + ldr r1, [sp] ; update dst for next loop + subs r7, r7, #0x10000 + add r0, r0, #16 ; updata src for next loop + add r1, r1, #1 + str r1, [sp] + + bne height_loop_2nd + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp8_filter +;--------------------------------- +|vp8_filter4_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp8_filter address + mov r7, r3, lsl #16 ; height is top part of counter + + ldr r4, [r11] ; load up packed filter coefficients + add lr, r1, r3 ; save final destination pointer + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + mov r4, #0x40 ; rounding factor (for smlad{x}) + +|height_loop_2nd_4| + ldrd r8, r9, [r0, #-4] ; load the data + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd_4| + ldr r10, [r0, #4]! + smladx r6, r9, r12, r4 ; apply filter + pkhbt r8, r9, r8 + smlad r5, r8, r12, r4 + pkhbt r8, r10, r9 + smladx r6, r10, r11, r6 + sub r7, r7, #1 + smlad r5, r8, r11, r5 + + mov r8, r9 ; shift the data for the next loop + mov r9, r10 + + usat r6, #8, r6, asr #7 ; shift and clamp + usat r5, #8, r5, asr #7 + + strb r5, [r1], r2 ; the result is transposed back and stored + tst r7, #0xff + strb r6, [r1], r2 + + bne width_loop_2nd_4 + + subs r7, r7, #0x10000 + add r0, r0, #16 ; update src for next loop + sub r1, lr, r7, lsr #16 ; update dst for next loop + + bne height_loop_2nd_4 + + ldmia sp!, {r4 - r11, pc} + + ENDP + +;------------------------------------ +; r0 unsigned char *src_ptr +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_first_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + add r7, r2, r3 ; preload next low + add r7, r7, #2 + pld [r0, r7] + + ldr r4, [sp, #36] ; output pitch + ldr r11, [sp, #40] ; HFilter address + sub sp, sp, #8 + + mov r7, r3 + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + sub r4, r4, r3 + str r4, [sp] ; save modified output pitch + str r2, [sp, #4] + + mov r2, #0x40 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + +; six tap filter +|height_loop_1st_only_6| + ldrb r8, [r0, #-2] ; load data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + + mov r12, r3, lsr #1 ; loop counter + +|width_loop_1st_only_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + +;; smuad lr, lr, r4 + smlad lr, lr, r4, r2 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 +;; smuad r8, r8, r4 + smlad r8, r8, r4, r2 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + subs r12, r12, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + +;; add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 +;; add r10, r10, #0x40 + strb lr, [r1], #1 ; store the result + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0, #-1] + strb r10, [r1], #1 + ldrneb r10, [r0], #2 + + bne width_loop_1st_only_6 + + ldr lr, [sp] ; load back output pitch + ldr r12, [sp, #4] ; load back output pitch + subs r7, r7, #1 + add r0, r0, r12 ; updata src for next loop + + add r11, r12, r3 ; preload next low + add r11, r11, #2 + pld [r0, r11] + + add r1, r1, lr ; update dst for next loop + + bne height_loop_1st_only_6 + + add sp, sp, #8 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_first_pass_only_armv6| + + +;------------------------------------ +; r0 unsigned char *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp8_filter +;------------------------------------ +|vp8_filter_block2d_second_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; VFilter address + ldr r12, [sp, #36] ; output pitch + + mov r7, r3, lsl #16 ; height is top part of counter + sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after + + sub sp, sp, #8 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r0, [sp] ; save r0 to stack + str r1, [sp, #4] ; save dst to stack + +; six tap filter +|width_loop_2nd_only_6| + ldrb r8, [r0], r2 ; load data + orr r7, r7, r3 ; loop counter + ldrb r9, [r0], r2 + ldrb r10, [r0], r2 + +|height_loop_2nd_only_6| + ; filter first column in this inner loop, than, move to next colum. + ldrb r11, [r0], r2 + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0], r2 + + smuad lr, lr, r4 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0], r2 + smlad r8, r11, r5, r8 + ldrb r11, [r0] + + sub r7, r7, #2 + sub r0, r0, r2, lsl #2 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + + ands r9, r7, #0xff + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0], r2 ; load data for next loop + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r12 ; store the result for the column + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0], r2 + strb r10, [r1], r12 + ldrneb r10, [r0], r2 + + bne height_loop_2nd_only_6 + + ldr r0, [sp] + ldr r1, [sp, #4] + subs r7, r7, #0x10000 + add r0, r0, #1 ; move to filter next column + str r0, [sp] + add r1, r1, #1 + str r1, [sp, #4] + + bne width_loop_2nd_only_6 + + add sp, sp, #8 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_filter_block2d_second_pass_only_armv6| + + END diff --git a/media/libvpx/vp8/common/arm/armv6/idct_blk_v6.c b/media/libvpx/vp8/common/arm/armv6/idct_blk_v6.c new file mode 100644 index 000000000..c94f84a62 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/idct_blk_v6.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + + +void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dst, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) + { + vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) + { + vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dstu, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, dstv, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/media/libvpx/vp8/common/arm/armv6/idct_v6.asm b/media/libvpx/vp8/common/arm/armv6/idct_v6.asm new file mode 100644 index 000000000..b4d44cbeb --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/idct_v6.asm @@ -0,0 +1,202 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_short_idct4x4llm_v6_dual| + + AREA |.text|, CODE, READONLY + + +; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, +; unsigned char *dst, int stride) +; r0 short* input +; r1 unsigned char* pred +; r2 int pitch +; r3 unsigned char* dst +; sp int stride + +|vp8_short_idct4x4llm_v6_dual| PROC + stmdb sp!, {r4-r11, lr} + + sub sp, sp, #4 + + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + + mov r5, #0x00004E00 ; cos + orr r5, r5, #0x0000007B ; cospi8sqrt2minus1 + orr r5, r5, #1<<31 ; loop counter on top bit + +loop1_dual + ldr r6, [r0, #(4*2)] ; i5 | i4 + ldr r12, [r0, #(12*2)] ; i13|i12 + ldr r14, [r0, #(8*2)] ; i9 | i8 + + smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 + smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 + + smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 + pkhtb r7, r9, r7, asr #16 ; 5c | 4c + pkhbt r8, r8, r10, lsl #16 ; 5s | 4s + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 + smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 + + subs r5, r5, #1<<31 ; i-- + + pkhtb r9, r11, r9, asr #16 ; 13c | 12c + ldr r11, [r0] ; i1 | i0 + pkhbt r10, r10, r7, lsl #16 ; 13s | 12s + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + + usub16 r7, r8, r7 ; c + uadd16 r6, r6, r10 ; d + uadd16 r10, r11, r14 ; a + usub16 r8, r11, r14 ; b + + uadd16 r9, r10, r6 ; a+d + usub16 r10, r10, r6 ; a-d + uadd16 r6, r8, r7 ; b+c + usub16 r7, r8, r7 ; b-c + + ; use input buffer to store intermediate results + str r6, [r0, #(4*2)] ; o5 | o4 + str r7, [r0, #(8*2)] ; o9 | o8 + str r10,[r0, #(12*2)] ; o13|o12 + str r9, [r0], #4 ; o1 | o0 + + bcs loop1_dual + + sub r0, r0, #8 ; reset input/output + str r0, [sp] + +loop2_dual + + ldr r6, [r0, #(4*2)] ; i5 | i4 + ldr r12,[r0, #(2*2)] ; i3 | i2 + ldr r14,[r0, #(6*2)] ; i7 | i6 + ldr r0, [r0, #(0*2)] ; i1 | i0 + + smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 + smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16 + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 + smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16 + + pkhbt r11, r6, r0, lsl #16 ; i0 | i4 + pkhtb r7, r7, r9, asr #16 ; 1c | 5c + pkhtb r0, r0, r6, asr #16 ; i1 | i5 + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 + + uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 + uadd16 r10, r11, r9 ; a + usub16 r9, r11, r9 ; b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 + + subs r5, r5, #1<<31 ; i-- + + smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 + smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 + + pkhtb r7, r7, r12, asr #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 + + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 + usub16 r12, r8, r6 ; c (o1 | o5) + uadd16 r6, r11, r0 ; d (o3 | o7) + uadd16 r7, r10, r6 ; a+d + + mov r8, #4 ; set up 4's + orr r8, r8, #0x40000 ; 4|4 + + usub16 r6, r10, r6 ; a-d + uadd16 r6, r6, r8 ; a-d+4, 3|7 + uadd16 r7, r7, r8 ; a+d+4, 0|4 + uadd16 r10, r9, r12 ; b+c + usub16 r0, r9, r12 ; b-c + uadd16 r10, r10, r8 ; b+c+4, 1|5 + uadd16 r8, r0, r8 ; b-c+4, 2|6 + + ldr lr, [sp, #40] ; dst stride + + ldrb r0, [r1] ; pred p0 + ldrb r11, [r1, #1] ; pred p1 + ldrb r12, [r1, #2] ; pred p2 + + add r0, r0, r7, asr #19 ; p0 + o0 + add r11, r11, r10, asr #19 ; p1 + o1 + add r12, r12, r8, asr #19 ; p2 + o2 + + usat r0, #8, r0 ; d0 = clip8(p0 + o0) + usat r11, #8, r11 ; d1 = clip8(p1 + o1) + usat r12, #8, r12 ; d2 = clip8(p2 + o2) + + add r0, r0, r11, lsl #8 ; |--|--|d1|d0| + + ldrb r11, [r1, #3] ; pred p3 + + add r0, r0, r12, lsl #16 ; |--|d2|d1|d0| + + add r11, r11, r6, asr #19 ; p3 + o3 + + sxth r7, r7 ; + sxth r10, r10 ; + + usat r11, #8, r11 ; d3 = clip8(p3 + o3) + + sxth r8, r8 ; + sxth r6, r6 ; + + add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0| + + ldrb r12, [r1, r2]! ; pred p4 + str r0, [r3], lr + ldrb r11, [r1, #1] ; pred p5 + + add r12, r12, r7, asr #3 ; p4 + o4 + add r11, r11, r10, asr #3 ; p5 + o5 + + usat r12, #8, r12 ; d4 = clip8(p4 + o4) + usat r11, #8, r11 ; d5 = clip8(p5 + o5) + + ldrb r7, [r1, #2] ; pred p6 + ldrb r10, [r1, #3] ; pred p6 + + add r12, r12, r11, lsl #8 ; |--|--|d5|d4| + + add r7, r7, r8, asr #3 ; p6 + o6 + add r10, r10, r6, asr #3 ; p7 + o7 + + ldr r0, [sp] ; load input pointer + + usat r7, #8, r7 ; d6 = clip8(p6 + o6) + usat r10, #8, r10 ; d7 = clip8(p7 + o7) + + add r12, r12, r7, lsl #16 ; |--|d6|d5|d4| + add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4| + + str r12, [r3], lr + add r0, r0, #16 + add r1, r1, r2 ; pred + pitch + + bcs loop2_dual + + add sp, sp, #4 ; idct_output buffer + ldmia sp!, {r4 - r11, pc} + + ENDP + + END diff --git a/media/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm b/media/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm new file mode 100644 index 000000000..c5ec824b3 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm @@ -0,0 +1,611 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_intra4x4_predict_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + +;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, +; B_PREDICTION_MODE left_stride, int b_mode, +; unsigned char *dst, int dst_stride, +; unsigned char top_left) + +; r0: *Above +; r1: *yleft +; r2: left_stride +; r3: b_mode +; sp + #40: dst +; sp + #44: dst_stride +; sp + #48: top_left +|vp8_intra4x4_predict_armv6| PROC + push {r4-r12, lr} + + cmp r3, #10 + addlt pc, pc, r3, lsl #2 ; position independent switch + pop {r4-r12, pc} ; default + b b_dc_pred + b b_tm_pred + b b_ve_pred + b b_he_pred + b b_ld_pred + b b_rd_pred + b b_vr_pred + b b_vl_pred + b b_hd_pred + b b_hu_pred + +b_dc_pred + ; load values + ldr r8, [r0] ; Above + ldrb r4, [r1], r2 ; Left[0] + mov r9, #0 + ldrb r5, [r1], r2 ; Left[1] + ldrb r6, [r1], r2 ; Left[2] + usad8 r12, r8, r9 + ldrb r7, [r1] ; Left[3] + + ; calculate dc + add r4, r4, r5 + add r4, r4, r6 + add r4, r4, r7 + add r4, r4, r12 + add r4, r4, #4 + ldr r0, [sp, #44] ; dst_stride + mov r12, r4, asr #3 ; (expected_dc + 4) >> 3 + + add r12, r12, r12, lsl #8 + ldr r3, [sp, #40] ; dst + add r12, r12, r12, lsl #16 + + ; store values + str r12, [r3], r0 + str r12, [r3], r0 + str r12, [r3], r0 + str r12, [r3] + + pop {r4-r12, pc} + +b_tm_pred + ldr r8, [r0] ; Above + ldrb r9, [sp, #48] ; top_left + ldrb r4, [r1], r2 ; Left[0] + ldrb r5, [r1], r2 ; Left[1] + ldrb r6, [r1], r2 ; Left[2] + ldrb r7, [r1] ; Left[3] + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + add r9, r9, r9, lsl #16 ; [tl|tl] + uxtb16 r10, r8 ; a[2|0] + uxtb16 r11, r8, ror #8 ; a[3|1] + ssub16 r10, r10, r9 ; a[2|0] - [tl|tl] + ssub16 r11, r11, r9 ; a[3|1] - [tl|tl] + + add r4, r4, r4, lsl #16 ; l[0|0] + add r5, r5, r5, lsl #16 ; l[1|1] + add r6, r6, r6, lsl #16 ; l[2|2] + add r7, r7, r7, lsl #16 ; l[3|3] + + sadd16 r1, r4, r10 ; l[0|0] + a[2|0] - [tl|tl] + sadd16 r2, r4, r11 ; l[0|0] + a[3|1] - [tl|tl] + usat16 r1, #8, r1 + usat16 r2, #8, r2 + + sadd16 r4, r5, r10 ; l[1|1] + a[2|0] - [tl|tl] + sadd16 r5, r5, r11 ; l[1|1] + a[3|1] - [tl|tl] + + add r12, r1, r2, lsl #8 ; [3|2|1|0] + str r12, [r3], r0 + + usat16 r4, #8, r4 + usat16 r5, #8, r5 + + sadd16 r1, r6, r10 ; l[2|2] + a[2|0] - [tl|tl] + sadd16 r2, r6, r11 ; l[2|2] + a[3|1] - [tl|tl] + + add r12, r4, r5, lsl #8 ; [3|2|1|0] + str r12, [r3], r0 + + usat16 r1, #8, r1 + usat16 r2, #8, r2 + + sadd16 r4, r7, r10 ; l[3|3] + a[2|0] - [tl|tl] + sadd16 r5, r7, r11 ; l[3|3] + a[3|1] - [tl|tl] + + add r12, r1, r2, lsl #8 ; [3|2|1|0] + + usat16 r4, #8, r4 + usat16 r5, #8, r5 + + str r12, [r3], r0 + + add r12, r4, r5, lsl #8 ; [3|2|1|0] + str r12, [r3] + + pop {r4-r12, pc} + +b_ve_pred + ldr r8, [r0] ; a[3|2|1|0] + ldr r11, c00FF00FF + ldrb r9, [sp, #48] ; top_left + ldrb r10, [r0, #4] ; a[4] + + ldr r0, c00020002 + + uxtb16 r4, r8 ; a[2|0] + uxtb16 r5, r8, ror #8 ; a[3|1] + ldr r2, [sp, #44] ; dst_stride + pkhbt r9, r9, r5, lsl #16 ; a[1|-1] + + add r9, r9, r4, lsl #1 ;[a[1]+2*a[2] | tl+2*a[0] ] + uxtab16 r9, r9, r5 ;[a[1]+2*a[2]+a[3] | tl+2*a[0]+a[1] ] + ldr r3, [sp, #40] ; dst + uxtab16 r9, r9, r0 ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2] + + add r0, r0, r10, lsl #16 ;[a[4]+2 | 2] + add r0, r0, r4, asr #16 ;[a[4]+2 | a[2]+2] + add r0, r0, r5, lsl #1 ;[a[4]+2*a[3]+2 | a[2]+2*a[1]+2] + uadd16 r4, r4, r0 ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2] + + and r9, r11, r9, asr #2 + and r4, r11, r4, asr #2 + add r9, r9, r4, lsl #8 + + ; store values + str r9, [r3], r2 + str r9, [r3], r2 + str r9, [r3], r2 + str r9, [r3] + + pop {r4-r12, pc} + + +b_he_pred + ldrb r4, [r1], r2 ; Left[0] + ldrb r8, [sp, #48] ; top_left + ldrb r5, [r1], r2 ; Left[1] + ldrb r6, [r1], r2 ; Left[2] + ldrb r7, [r1] ; Left[3] + + add r8, r8, r4 ; tl + l[0] + add r9, r4, r5 ; l[0] + l[1] + add r10, r5, r6 ; l[1] + l[2] + add r11, r6, r7 ; l[2] + l[3] + + mov r0, #2<<14 + + add r8, r8, r9 ; tl + 2*l[0] + l[1] + add r4, r9, r10 ; l[0] + 2*l[1] + l[2] + add r5, r10, r11 ; l[1] + 2*l[2] + l[3] + add r6, r11, r7, lsl #1 ; l[2] + 2*l[3] + l[3] + + + add r8, r0, r8, lsl #14 ; (tl + 2*l[0] + l[1])>>2 in top half + add r9, r0, r4, lsl #14 ; (l[0] + 2*l[1] + l[2])>>2 in top half + add r10,r0, r5, lsl #14 ; (l[1] + 2*l[2] + l[3])>>2 in top half + add r11,r0, r6, lsl #14 ; (l[2] + 2*l[3] + l[3])>>2 in top half + + pkhtb r8, r8, r8, asr #16 ; l[-|0|-|0] + pkhtb r9, r9, r9, asr #16 ; l[-|1|-|1] + pkhtb r10, r10, r10, asr #16 ; l[-|2|-|2] + pkhtb r11, r11, r11, asr #16 ; l[-|3|-|3] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + add r8, r8, r8, lsl #8 ; l[0|0|0|0] + add r9, r9, r9, lsl #8 ; l[1|1|1|1] + add r10, r10, r10, lsl #8 ; l[2|2|2|2] + add r11, r11, r11, lsl #8 ; l[3|3|3|3] + + ; store values + str r8, [r3], r0 + str r9, [r3], r0 + str r10, [r3], r0 + str r11, [r3] + + pop {r4-r12, pc} + +b_ld_pred + ldr r4, [r0] ; Above[0-3] + ldr r12, c00020002 + ldr r5, [r0, #4] ; Above[4-7] + ldr lr, c00FF00FF + + uxtb16 r6, r4 ; a[2|0] + uxtb16 r7, r4, ror #8 ; a[3|1] + uxtb16 r8, r5 ; a[6|4] + uxtb16 r9, r5, ror #8 ; a[7|5] + pkhtb r10, r6, r8 ; a[2|4] + pkhtb r11, r7, r9 ; a[3|5] + + add r4, r6, r7, lsl #1 ; [a2+2*a3 | a0+2*a1] + add r4, r4, r10, ror #16 ; [a2+2*a3+a4 | a0+2*a1+a2] + uxtab16 r4, r4, r12 ; [a2+2*a3+a4+2 | a0+2*a1+a2+2] + + add r5, r7, r10, ror #15 ; [a3+2*a4 | a1+2*a2] + add r5, r5, r11, ror #16 ; [a3+2*a4+a5 | a1+2*a2+a3] + uxtab16 r5, r5, r12 ; [a3+2*a4+a5+2 | a1+2*a2+a3+2] + + pkhtb r7, r9, r8, asr #16 + add r6, r8, r9, lsl #1 ; [a6+2*a7 | a4+2*a5] + uadd16 r6, r6, r7 ; [a6+2*a7+a7 | a4+2*a5+a6] + uxtab16 r6, r6, r12 ; [a6+2*a7+a7+2 | a4+2*a5+a6+2] + + uxth r7, r9 ; [ a5] + add r7, r7, r8, asr #15 ; [ a5+2*a6] + add r7, r7, r9, asr #16 ; [ a5+2*a6+a7] + uxtah r7, r7, r12 ; [ a5+2*a6+a7+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r4, lr, r4, asr #2 + and r5, lr, r5, asr #2 + and r6, lr, r6, asr #2 + mov r7, r7, asr #2 + + add r8, r4, r5, lsl #8 ; [3|2|1|0] + str r8, [r3], r0 + + mov r9, r8, lsr #8 + add r9, r9, r6, lsl #24 ; [4|3|2|1] + str r9, [r3], r0 + + mov r10, r9, lsr #8 + add r10, r10, r7, lsl #24 ; [5|4|3|2] + str r10, [r3], r0 + + mov r6, r6, lsr #16 + mov r11, r10, lsr #8 + add r11, r11, r6, lsl #24 ; [6|5|4|3] + str r11, [r3] + + pop {r4-r12, pc} + +b_rd_pred + ldrb r7, [r1], r2 ; l[0] = pp[3] + ldr lr, [r0] ; Above = pp[8|7|6|5] + ldrb r8, [sp, #48] ; tl = pp[4] + ldrb r6, [r1], r2 ; l[1] = pp[2] + ldrb r5, [r1], r2 ; l[2] = pp[1] + ldrb r4, [r1], r2 ; l[3] = pp[0] + + + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + add r4, r4, r6, lsl #16 ; p[2|0] + add r5, r5, r7, lsl #16 ; p[3|1] + add r6, r6, r8, lsl #16 ; p[4|2] + pkhbt r7, r7, r9, lsl #16 ; p[5|3] + pkhbt r8, r8, r10, lsl #16 ; p[6|4] + + ldr r12, c00020002 + ldr lr, c00FF00FF + + add r4, r4, r5, lsl #1 ; [p2+2*p3 | p0+2*p1] + add r4, r4, r6 ; [p2+2*p3+p4 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2] + + add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r6, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r6, r6, r9 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r6, r6, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + add r7, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5] + add r7, r7, r10 ; [p6+2*p7+p8 | p4+2*p5+p6] + uxtab16 r7, r7, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r7, lr, r7, asr #2 + and r6, lr, r6, asr #2 + and r5, lr, r5, asr #2 + and r4, lr, r4, asr #2 + + add r8, r6, r7, lsl #8 ; [6|5|4|3] + str r8, [r3], r0 + + mov r9, r8, lsl #8 ; [5|4|3|-] + uxtab r9, r9, r4, ror #16 ; [5|4|3|2] + str r9, [r3], r0 + + mov r10, r9, lsl #8 ; [4|3|2|-] + uxtab r10, r10, r5 ; [4|3|2|1] + str r10, [r3], r0 + + mov r11, r10, lsl #8 ; [3|2|1|-] + uxtab r11, r11, r4 ; [3|2|1|0] + str r11, [r3] + + pop {r4-r12, pc} + +b_vr_pred + ldrb r7, [r1], r2 ; l[0] = pp[3] + ldr lr, [r0] ; Above = pp[8|7|6|5] + ldrb r8, [sp, #48] ; tl = pp[4] + ldrb r6, [r1], r2 ; l[1] = pp[2] + ldrb r5, [r1], r2 ; l[2] = pp[1] + ldrb r4, [r1] ; l[3] = pp[0] + + add r5, r5, r7, lsl #16 ; p[3|1] + add r6, r6, r8, lsl #16 ; p[4|2] + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + pkhbt r7, r7, r9, lsl #16 ; p[5|3] + pkhbt r8, r8, r10, lsl #16 ; p[6|4] + + ldr r4, c00010001 + ldr r12, c00020002 + ldr lr, c00FF00FF + + add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r6, r6, r7, lsl #1 ; [p4+2*p5 | p2+2*p3] + add r6, r6, r8 ; [p4+2*p5+p6 | p2+2*p3+p4] + uxtab16 r6, r6, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2] + + uadd16 r11, r8, r9 ; [p6+p7 | p4+p5] + uhadd16 r11, r11, r4 ; [(p6+p7+1)>>1 | (p4+p5+1)>>1] + ; [F|E] + + add r7, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r7, r7, r9 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r7, r7, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + uadd16 r2, r9, r10 ; [p7+p8 | p5+p6] + uhadd16 r2, r2, r4 ; [(p7+p8+1)>>1 | (p5+p6+1)>>1] + ; [J|I] + + add r8, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5] + add r8, r8, r10 ; [p6+2*p7+p8 | p4+2*p5+p6] + uxtab16 r8, r8, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r5, lr, r5, asr #2 ; [B|A] + and r6, lr, r6, asr #2 ; [D|C] + and r7, lr, r7, asr #2 ; [H|G] + and r8, lr, r8, asr #2 ; [L|K] + + add r12, r11, r2, lsl #8 ; [J|F|I|E] + str r12, [r3], r0 + + add r12, r7, r8, lsl #8 ; [L|H|K|G] + str r12, [r3], r0 + + pkhbt r2, r6, r2, lsl #16 ; [-|I|-|C] + add r2, r2, r11, lsl #8 ; [F|I|E|C] + + pkhtb r12, r6, r5 ; [-|D|-|A] + pkhtb r10, r7, r5, asr #16 ; [-|H|-|B] + str r2, [r3], r0 + add r12, r12, r10, lsl #8 ; [H|D|B|A] + str r12, [r3] + + pop {r4-r12, pc} + +b_vl_pred + ldr r4, [r0] ; [3|2|1|0] = Above[0-3] + ldr r12, c00020002 + ldr r5, [r0, #4] ; [7|6|5|4] = Above[4-7] + ldr lr, c00FF00FF + ldr r2, c00010001 + + mov r0, r4, lsr #16 ; [-|-|3|2] + add r0, r0, r5, lsl #16 ; [5|4|3|2] + uxtb16 r6, r4 ; [2|0] + uxtb16 r7, r4, ror #8 ; [3|1] + uxtb16 r8, r0 ; [4|2] + uxtb16 r9, r0, ror #8 ; [5|3] + uxtb16 r10, r5 ; [6|4] + uxtb16 r11, r5, ror #8 ; [7|5] + + uadd16 r4, r6, r7 ; [p2+p3 | p0+p1] + uhadd16 r4, r4, r2 ; [(p2+p3+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r5, r6, r7, lsl #1 ; [p2+2*p3 | p0+2*p1] + add r5, r5, r8 ; [p2+2*p3+p4 | p0+2*p1+p2] + uxtab16 r5, r5, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2] + + uadd16 r6, r7, r8 ; [p3+p4 | p1+p2] + uhadd16 r6, r6, r2 ; [(p3+p4+1)>>1 | (p1+p2+1)>>1] + ; [F|E] + + add r7, r7, r8, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r7, r7, r9 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r7, r7, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r8, r8, r9, lsl #1 ; [p4+2*p5 | p2+2*p3] + add r8, r8, r10 ; [p4+2*p5+p6 | p2+2*p3+p4] + uxtab16 r8, r8, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2] + + add r9, r9, r10, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r9, r9, r11 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r9, r9, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + ldr r0, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + ; scale down + and r5, lr, r5, asr #2 ; [D|C] + and r7, lr, r7, asr #2 ; [H|G] + and r8, lr, r8, asr #2 ; [I|D] + and r9, lr, r9, asr #2 ; [J|H] + + add r10, r4, r6, lsl #8 ; [F|B|E|A] + str r10, [r3], r0 + + add r5, r5, r7, lsl #8 ; [H|C|G|D] + str r5, [r3], r0 + + pkhtb r12, r8, r4, asr #16 ; [-|I|-|B] + pkhtb r10, r9, r8 ; [-|J|-|D] + + add r12, r6, r12, lsl #8 ; [I|F|B|E] + str r12, [r3], r0 + + add r10, r7, r10, lsl #8 ; [J|H|D|G] + str r10, [r3] + + pop {r4-r12, pc} + +b_hd_pred + ldrb r7, [r1], r2 ; l[0] = pp[3] + ldr lr, [r0] ; Above = pp[8|7|6|5] + ldrb r8, [sp, #48] ; tl = pp[4] + ldrb r6, [r1], r2 ; l[1] = pp[2] + ldrb r5, [r1], r2 ; l[2] = pp[1] + ldrb r4, [r1] ; l[3] = pp[0] + + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + + add r4, r4, r5, lsl #16 ; p[1|0] + add r5, r5, r6, lsl #16 ; p[2|1] + add r6, r6, r7, lsl #16 ; p[3|2] + add r7, r7, r8, lsl #16 ; p[4|3] + + ldr r12, c00020002 + ldr lr, c00FF00FF + ldr r2, c00010001 + + pkhtb r8, r7, r9 ; p[4|5] + pkhtb r1, r9, r10 ; p[7|6] + pkhbt r10, r8, r10, lsl #16 ; p[6|5] + + uadd16 r11, r4, r5 ; [p1+p2 | p0+p1] + uhadd16 r11, r11, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1] + add r4, r4, r6 ; [p1+2*p2+p3 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2] + + uadd16 r0, r6, r7 ; [p3+p4 | p2+p3] + uhadd16 r0, r0, r2 ; [(p3+p4+1)>>1 | (p2+p3+1)>>1] + ; [F|E] + + add r5, r6, r7, lsl #1 ; [p3+2*p4 | p2+2*p3] + add r5, r5, r8, ror #16 ; [p3+2*p4+p5 | p2+2*p3+p4] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p2+2*p3+p4+2] + + add r6, r12, r8, ror #16 ; [p5+2 | p4+2] + add r6, r6, r10, lsl #1 ; [p5+2+2*p6 | p4+2+2*p5] + uxtab16 r6, r6, r1 ; [p5+2+2*p6+p7 | p4+2+2*p5+p6] + + ; scale down + and r4, lr, r4, asr #2 ; [D|C] + and r5, lr, r5, asr #2 ; [H|G] + and r6, lr, r6, asr #2 ; [J|I] + + ldr lr, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + + pkhtb r2, r0, r6 ; [-|F|-|I] + pkhtb r12, r6, r5, asr #16 ; [-|J|-|H] + add r12, r12, r2, lsl #8 ; [F|J|I|H] + add r2, r0, r5, lsl #8 ; [H|F|G|E] + mov r12, r12, ror #24 ; [J|I|H|F] + str r12, [r3], lr + + mov r7, r11, asr #16 ; [-|-|-|B] + str r2, [r3], lr + add r7, r7, r0, lsl #16 ; [-|E|-|B] + add r7, r7, r4, asr #8 ; [-|E|D|B] + add r7, r7, r5, lsl #24 ; [G|E|D|B] + str r7, [r3], lr + + add r5, r11, r4, lsl #8 ; [D|B|C|A] + str r5, [r3] + + pop {r4-r12, pc} + + + +b_hu_pred + ldrb r4, [r1], r2 ; Left[0] + ldr r12, c00020002 + ldrb r5, [r1], r2 ; Left[1] + ldr lr, c00FF00FF + ldrb r6, [r1], r2 ; Left[2] + ldr r2, c00010001 + ldrb r7, [r1] ; Left[3] + + add r4, r4, r5, lsl #16 ; [1|0] + add r5, r5, r6, lsl #16 ; [2|1] + add r9, r6, r7, lsl #16 ; [3|2] + + uadd16 r8, r4, r5 ; [p1+p2 | p0+p1] + uhadd16 r8, r8, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1] + add r4, r4, r9 ; [p1+2*p2+p3 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2] + ldr r2, [sp, #44] ; dst_stride + ldr r3, [sp, #40] ; dst + and r4, lr, r4, asr #2 ; [D|C] + + add r10, r6, r7 ; [p2+p3] + add r11, r10, r7, lsl #1 ; [p2+3*p3] + add r10, r10, #1 + add r11, r11, #2 + mov r10, r10, asr #1 ; [E] + mov r11, r11, asr #2 ; [F] + + add r9, r7, r9, asr #8 ; [-|-|G|G] + add r0, r8, r4, lsl #8 ; [D|B|C|A] + add r7, r9, r9, lsl #16 ; [G|G|G|G] + + str r0, [r3], r2 + + mov r1, r8, asr #16 ; [-|-|-|B] + add r1, r1, r4, asr #8 ; [-|-|D|B] + add r1, r1, r10, lsl #16 ; [-|E|D|B] + add r1, r1, r11, lsl #24 ; [F|E|D|B] + str r1, [r3], r2 + + add r10, r11, lsl #8 ; [-|-|F|E] + add r10, r10, r9, lsl #16 ; [G|G|F|E] + str r10, [r3], r2 + + str r7, [r3] + + pop {r4-r12, pc} + + ENDP + +; constants +c00010001 + DCD 0x00010001 +c00020002 + DCD 0x00020002 +c00FF00FF + DCD 0x00FF00FF + + END diff --git a/media/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm b/media/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm new file mode 100644 index 000000000..31ef09cad --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm @@ -0,0 +1,136 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp8_short_inv_walsh4x4_v6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) +|vp8_short_inv_walsh4x4_v6| PROC + + stmdb sp!, {r4 - r12, lr} + + ldr r2, [r0, #0] ; [1 | 0] + ldr r3, [r0, #4] ; [3 | 2] + ldr r4, [r0, #8] ; [5 | 4] + ldr r5, [r0, #12] ; [7 | 6] + ldr r6, [r0, #16] ; [9 | 8] + ldr r7, [r0, #20] ; [11 | 10] + ldr r8, [r0, #24] ; [13 | 12] + ldr r9, [r0, #28] ; [15 | 14] + + qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] + qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] + qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] + qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] + + qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] + qadd16 r4, r12, lr ; c1 + d1 [5 | 4] + qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] + qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] + + qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] + qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] + qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] + qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] + + qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] + qadd16 r5, r12, lr ; c1 + d1 [7 | 6] + qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] + qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] + + ; first transform complete + + qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] + qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] + qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] + qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] + + qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] + ldr r10, c0x00030003 + qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r2, r2, r10 ; [b2+3|c2+3] + qadd16 r3, r3, r10 ; [a2+3|d2+3] + qadd16 r4, r4, r10 ; [b2+3|c2+3] + qadd16 r5, r5, r10 ; [a2+3|d2+3] + + asr r12, r3, #19 ; [0] + strh r12, [r1], #32 + asr lr, r2, #19 ; [1] + strh lr, [r1], #32 + sxth r2, r2 + sxth r3, r3 + asr r2, r2, #3 ; [2] + strh r2, [r1], #32 + asr r3, r3, #3 ; [3] + strh r3, [r1], #32 + + asr r12, r5, #19 ; [4] + strh r12, [r1], #32 + asr lr, r4, #19 ; [5] + strh lr, [r1], #32 + sxth r4, r4 + sxth r5, r5 + asr r4, r4, #3 ; [6] + strh r4, [r1], #32 + asr r5, r5, #3 ; [7] + strh r5, [r1], #32 + + qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] + qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] + qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] + qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15] + + qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] + qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r6, r6, r10 ; [b2+3|c2+3] + qadd16 r7, r7, r10 ; [a2+3|d2+3] + qadd16 r8, r8, r10 ; [b2+3|c2+3] + qadd16 r9, r9, r10 ; [a2+3|d2+3] + + asr r12, r7, #19 ; [8] + strh r12, [r1], #32 + asr lr, r6, #19 ; [9] + strh lr, [r1], #32 + sxth r6, r6 + sxth r7, r7 + asr r6, r6, #3 ; [10] + strh r6, [r1], #32 + asr r7, r7, #3 ; [11] + strh r7, [r1], #32 + + asr r12, r9, #19 ; [12] + strh r12, [r1], #32 + asr lr, r8, #19 ; [13] + strh lr, [r1], #32 + sxth r8, r8 + sxth r9, r9 + asr r8, r8, #3 ; [14] + strh r8, [r1], #32 + asr r9, r9, #3 ; [15] + strh r9, [r1], #32 + + ldmia sp!, {r4 - r12, pc} + ENDP ; |vp8_short_inv_walsh4x4_v6| + + +; Constant Pool +c0x00030003 DCD 0x00030003 + END diff --git a/media/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm b/media/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm new file mode 100644 index 000000000..1cbbbcdef --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm @@ -0,0 +1,1282 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_horizontal_edge_armv6| + EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_vertical_edge_armv6| + EXPORT |vp8_mbloop_filter_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + +src RN r0 +pstep RN r1 +count RN r5 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *blimit, +;r3 const char *limit, +;stack const char *thresh, +;stack int count + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldrb r4, [r2] ; blimit + ldr r10, [src], pstep ; p2 + ldrb r2, [r3] ; limit + ldr r11, [src], pstep ; p1 + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|Hnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + orr lr, lr, r10 + sub src, src, pstep, lsl #2 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq hskip_filter ; skip filtering + + sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines + + ;vp8_hevmask() function + ;calculate high edge variance + orr r10, r6, r8 ; calculate vp8_hevmask + + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 ; use usub8 instead of ssub8 + sel r6, r12, r11 ; obtain vp8_hevmask: r6 + + ;vp8_filter() function + ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + and r7, r7, lr ; vp8_filter &= mask; + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: Filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + + ;end of modification for vp8 + + mov lr, #0 + sadd8 r7, r7 , r10 ; vp8_filter += 1 + shadd8 r7, r7, lr ; vp8_filter >>= 1 + + ldr r11, [sp, #12] ; load ps1 + ldr r10, [sp, #8] ; load qs1 + + bic r7, r7, r6 ; vp8_filter &= ~hev + sub src, src, pstep, lsl #2 + + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + + eor r11, r11, r12 ; *op1 = u^0x80 + str r11, [src], pstep ; store op1 + eor r9, r9, r12 ; *op0 = u^0x80 + str r9, [src], pstep ; store op0 result + eor r8, r8, r12 ; *oq0 = u^0x80 + str r8, [src], pstep ; store oq0 result + eor r10, r10, r12 ; *oq1 = u^0x80 + str r10, [src], pstep ; store oq1 + + sub src, src, pstep, lsl #1 + +|hskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #2 + + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne Hnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldrb r4, [r2] ; blimit + ldr r10, [src], pstep ; p2 + ldrb r2, [r3] ; limit + ldr r11, [src], pstep ; p1 + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|MBHnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + + orr lr, lr, r10 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbhskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines + sub src, src, pstep, lsl #1 + + orr r10, r6, r8 + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_mbfilter() function + ;p2, q2 are only needed at the end. Don't need to load them in now. + ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src] ; q1 + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + str r8, [src] ; store *oq0 + sub src, src, pstep + eor r10, r10, lr ; *op0 = s^0x80 + str r10, [src] ; store *op0 + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + eor r11, r11, lr ; *op1 = s^0x80 + str r11, [src], pstep ; store *op1 + eor r8, r8, lr ; *oq1 = s^0x80 + add src, src, pstep, lsl #1 + + mov r7, #0x3f ; 63 + + str r8, [src], pstep ; store *oq1 + + ;roughly 1/7th difference across boundary + mov lr, #0x9 ; 9 + ldr r9, [src] ; load q2 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + sub src, src, pstep + ldr lr, c0x80808080 + + ldr r11, [src] ; load p2 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + eor r9, r9, lr + eor r11, r11, lr + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + str r8, [src], pstep, lsl #2 ; store *op2 + add src, src, pstep + eor r10, r10, lr ; *oq2 = s^0x80 + str r10, [src], pstep, lsl #1 ; store *oq2 + +|mbhskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #3 + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne MBHnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldrb r4, [r2] ; blimit + ldr r7, [src], pstep + ldrb r2, [r3] ; limit + ldr r8, [src], pstep + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|Vnext8| + + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq vskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_filter() function + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first + str r6, [sp] + str lr, [sp, #4] + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to r7, r8, r9, r10 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask + + ;modify code for vp8 -- Filter1 = vp8_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) + qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) + ;end of modification for vp8 + + eor r8, r8, r12 + eor r9, r9, r12 + + mov lr, #0 + + sadd8 r7, r7, r10 + shadd8 r7, r7, lr + + ldr r10, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + bic r7, r7, r6 ; r7: vp8_filter + + qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) + qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) + eor r10, r10, r12 + eor r11, r11, r12 + + sub src, src, pstep, lsl #2 + + ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 + ;output is b0, b1, b2, b3 + ;b0: 03 02 01 00 + ;b1: 13 12 11 10 + ;b2: 23 22 21 20 + ;b3: 33 32 31 30 + ; p1 p0 q0 q1 + ; (a3 a2 a1 a0) + TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr + + strh r6, [src, #-2] ; store the result + mov r6, r6, lsr #16 + strh r6, [src], pstep + + strh r7, [src, #-2] + mov r7, r7, lsr #16 + strh r7, [src], pstep + + strh r12, [src, #-2] + mov r12, r12, lsr #16 + strh r12, [src], pstep + + strh lr, [src, #-2] + mov lr, lr, lsr #16 + strh lr, [src], pstep + +|vskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne Vnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_vertical_edge_armv6| + + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + pld [src, #23] ; preload for next block + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldrb r4, [r2] ; blimit + pld [src, #23] + ldr r7, [src], pstep + ldrb r2, [r3] ; limit + pld [src, #23] + ldr r8, [src], pstep + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 + pld [src, #23] + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|MBVnext8| + ; vp8_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbvskip_filter ; skip filtering + + + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + + ; vp8_mbfilter() function + ; p2, q2 are only needed at the end. Don't need to load them in now. + ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + str r6, [sp] ; save r6 + str lr, [sp, #4] ; save lr + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to p1, p0, q0, q1 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp8_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp8_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + eor r10, r10, lr ; *op0 = s^0x80 + + strb r10, [src, #-1] ; store op0 result + strb r8, [src], pstep ; store oq0 result + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + ldr lr, c0x80808080 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + add src, src, #2 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + + qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) + qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) + eor r8, r8, lr ; *oq1 = s^0x80 + eor r10, r10, lr ; *op1 = s^0x80 + + ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary + strb r10, [src, #-4] ; store op1 + strb r8, [src, #-1] ; store oq1 + ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #8 + orr r9, r9, r7, lsl #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #16 + orr r9, r9, r7, lsl #16 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + orr r11, r11, r6, lsl #24 + orr r9, r9, r7, lsl #24 + + ;roughly 1/7th difference across boundary + eor r9, r9, lr + eor r11, r11, lr + + mov lr, #0x9 ; 9 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + ldr lr, c0x80808080 + + orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + eor r10, r10, lr ; *oq2 = s^0x80 + + strb r8, [src, #-5] ; store *op2 + strb r10, [src], pstep ; store *oq2 + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + + ;adjust src pointer for next loop + sub src, src, #2 + +|mbvskip_filter| + sub src, src, #4 + subs count, count, #1 + + pld [src, #23] ; preload for next block + ldrne r6, [src], pstep ; load source data + pld [src, #23] + ldrne r7, [src], pstep + pld [src, #23] + ldrne r8, [src], pstep + pld [src, #23] + ldrne lr, [src], pstep + + bne MBVnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 +c0x01010101 DCD 0x01010101 +c0x7F7F7F7F DCD 0x7F7F7F7F + + END diff --git a/media/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/media/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm new file mode 100644 index 000000000..5e00cf01b --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -0,0 +1,286 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6| + EXPORT |vp8_loop_filter_simple_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + + +src RN r0 +pstep RN r1 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *blimit + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_simple_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + ldrb r12, [r2] ; blimit + ldr r3, [src, -pstep, lsl #1] ; p1 + ldr r4, [src, -pstep] ; p0 + ldr r5, [src] ; q0 + ldr r6, [src, pstep] ; q1 + orr r12, r12, r12, lsl #8 ; blimit + ldr r2, c0x80808080 + orr r12, r12, r12, lsl #16 ; blimit + mov r9, #4 ; double the count. we're doing 4 at a time + mov lr, #0 ; need 0 in a couple places + +|simple_hnext8| + ; vp8_simple_filter_mask() + + uqsub8 r7, r3, r6 ; p1 - q1 + uqsub8 r8, r6, r3 ; q1 - p1 + uqsub8 r10, r4, r5 ; p0 - q0 + uqsub8 r11, r5, r4 ; q0 - p0 + orr r8, r8, r7 ; abs(p1 - q1) + orr r10, r10, r11 ; abs(p0 - q0) + uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 + uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1 + uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 + mvn r8, #0 + usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags + sel r10, r8, lr ; filter mask: F or 0 + cmp r10, #0 + beq simple_hskip_filter ; skip filtering if all masks are 0x00 + + ;vp8_simple_filter() + + eor r3, r3, r2 ; p1 offset to convert to a signed value + eor r6, r6, r2 ; q1 offset to convert to a signed value + eor r4, r4, r2 ; p0 offset to convert to a signed value + eor r5, r5, r2 ; q0 offset to convert to a signed value + + qsub8 r3, r3, r6 ; vp8_filter = p1 - q1 + qsub8 r6, r5, r4 ; q0 - p0 + qadd8 r3, r3, r6 ; += q0 - p0 + ldr r7, c0x04040404 + qadd8 r3, r3, r6 ; += q0 - p0 + ldr r8, c0x03030303 + qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0)) + ;STALL + and r3, r3, r10 ; vp8_filter &= mask + + qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4 + qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3 + + shadd8 r7 , r7 , lr + shadd8 r8 , r8 , lr + shadd8 r7 , r7 , lr + shadd8 r8 , r8 , lr + shadd8 r7 , r7 , lr ; Filter1 >>= 3 + shadd8 r8 , r8 , lr ; Filter2 >>= 3 + + qsub8 r5 ,r5, r7 ; u = q0 - Filter1 + qadd8 r4, r4, r8 ; u = p0 + Filter2 + eor r5, r5, r2 ; *oq0 = u^0x80 + str r5, [src] ; store oq0 result + eor r4, r4, r2 ; *op0 = u^0x80 + str r4, [src, -pstep] ; store op0 result + +|simple_hskip_filter| + subs r9, r9, #1 + addne src, src, #4 ; next row + + ldrne r3, [src, -pstep, lsl #1] ; p1 + ldrne r4, [src, -pstep] ; p0 + ldrne r5, [src] ; q0 + ldrne r6, [src, pstep] ; q1 + + bne simple_hnext8 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_loop_filter_simple_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + ldrb r12, [r2] ; r12: blimit + ldr r2, c0x80808080 + orr r12, r12, r12, lsl #8 + + ; load soure data to r7, r8, r9, r10 + ldrh r3, [src, #-2] + pld [src, #23] ; preload for next block + ldrh r4, [src], pstep + orr r12, r12, r12, lsl #16 + + ldrh r5, [src, #-2] + pld [src, #23] + ldrh r6, [src], pstep + + pkhbt r7, r3, r4, lsl #16 + + ldrh r3, [src, #-2] + pld [src, #23] + ldrh r4, [src], pstep + + pkhbt r8, r5, r6, lsl #16 + + ldrh r5, [src, #-2] + pld [src, #23] + ldrh r6, [src], pstep + mov r11, #4 ; double the count. we're doing 4 at a time + +|simple_vnext8| + ; vp8_simple_filter_mask() function + pkhbt r9, r3, r4, lsl #16 + pkhbt r10, r5, r6, lsl #16 + + ;transpose r7, r8, r9, r10 to r3, r4, r5, r6 + TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 + + uqsub8 r7, r3, r6 ; p1 - q1 + uqsub8 r8, r6, r3 ; q1 - p1 + uqsub8 r9, r4, r5 ; p0 - q0 + uqsub8 r10, r5, r4 ; q0 - p0 + orr r7, r7, r8 ; abs(p1 - q1) + orr r9, r9, r10 ; abs(p0 - q0) + mov r8, #0 + uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 + uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2 + uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 + mvn r10, #0 ; r10 == -1 + + usub8 r7, r12, r7 ; compare to flimit + sel lr, r10, r8 ; filter mask + + cmp lr, #0 + beq simple_vskip_filter ; skip filtering + + ;vp8_simple_filter() function + eor r3, r3, r2 ; p1 offset to convert to a signed value + eor r6, r6, r2 ; q1 offset to convert to a signed value + eor r4, r4, r2 ; p0 offset to convert to a signed value + eor r5, r5, r2 ; q0 offset to convert to a signed value + + qsub8 r3, r3, r6 ; vp8_filter = p1 - q1 + qsub8 r6, r5, r4 ; q0 - p0 + + qadd8 r3, r3, r6 ; vp8_filter += q0 - p0 + ldr r9, c0x03030303 ; r9 = 3 + + qadd8 r3, r3, r6 ; vp8_filter += q0 - p0 + ldr r7, c0x04040404 + + qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0)) + ;STALL + and r3, r3, lr ; vp8_filter &= mask + + qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3 + qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4 + + shadd8 r9 , r9 , r8 + shadd8 r3 , r3 , r8 + shadd8 r9 , r9 , r8 + shadd8 r3 , r3 , r8 + shadd8 r9 , r9 , r8 ; Filter2 >>= 3 + shadd8 r3 , r3 , r8 ; Filter1 >>= 3 + + ;calculate output + sub src, src, pstep, lsl #2 + + qadd8 r4, r4, r9 ; u = p0 + Filter2 + qsub8 r5, r5, r3 ; u = q0 - Filter1 + eor r4, r4, r2 ; *op0 = u^0x80 + eor r5, r5, r2 ; *oq0 = u^0x80 + + strb r4, [src, #-1] ; store the result + mov r4, r4, lsr #8 + strb r5, [src], pstep + mov r5, r5, lsr #8 + + strb r4, [src, #-1] + mov r4, r4, lsr #8 + strb r5, [src], pstep + mov r5, r5, lsr #8 + + strb r4, [src, #-1] + mov r4, r4, lsr #8 + strb r5, [src], pstep + mov r5, r5, lsr #8 + + strb r4, [src, #-1] + strb r5, [src], pstep + +|simple_vskip_filter| + subs r11, r11, #1 + + ; load soure data to r7, r8, r9, r10 + ldrneh r3, [src, #-2] + pld [src, #23] ; preload for next block + ldrneh r4, [src], pstep + + ldrneh r5, [src, #-2] + pld [src, #23] + ldrneh r6, [src], pstep + + pkhbt r7, r3, r4, lsl #16 + + ldrneh r3, [src, #-2] + pld [src, #23] + ldrneh r4, [src], pstep + + pkhbt r8, r5, r6, lsl #16 + + ldrneh r5, [src, #-2] + pld [src, #23] + ldrneh r6, [src], pstep + + bne simple_vnext8 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 + + END diff --git a/media/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/media/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm new file mode 100644 index 000000000..e81aef53d --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm @@ -0,0 +1,273 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict8x4_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack unsigned char *dst_ptr, +; stack int dst_pitch +;------------------------------------- +;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184. +;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack, +;and the result is stored in transpose. +|vp8_sixtap_predict8x4_armv6| PROC + stmdb sp!, {r4 - r11, lr} + str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + add lr, sp, #4 ;point to temporary buffer + beq skip_firstpass_filter + +;first-pass filter + adr r12, filter8_coeff + sub r0, r0, r1, lsl #1 + + add r3, r1, #10 ; preload next low + pld [r0, r3] + + add r2, r12, r2, lsl #4 ;calculate filter location + add r0, r0, #3 ;adjust src only for loading convinience + + ldr r3, [r2] ; load up packed filter coefficients + ldr r4, [r2, #4] + ldr r5, [r2, #8] + + mov r2, #0x90000 ; height=9 is top part of counter + + sub r1, r1, #8 + +|first_pass_hloop_v6| + ldrb r6, [r0, #-5] ; load source data + ldrb r7, [r0, #-4] + ldrb r8, [r0, #-3] + ldrb r9, [r0, #-2] + ldrb r10, [r0, #-1] + + orr r2, r2, #0x4 ; construct loop counter. width=8=4x2 + + pkhbt r6, r6, r7, lsl #16 ; r7 | r6 + pkhbt r7, r7, r8, lsl #16 ; r8 | r7 + + pkhbt r8, r8, r9, lsl #16 ; r9 | r8 + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + +|first_pass_wloop_v6| + smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1] + smuad r12, r7, r3 + + ldrb r6, [r0], #1 + + smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3] + ldrb r7, [r0], #1 + smlad r12, r9, r4, r12 + + pkhbt r10, r10, r6, lsl #16 ; r10 | r9 + pkhbt r6, r6, r7, lsl #16 ; r11 | r10 + smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5] + smlad r12, r6, r5, r12 + + sub r2, r2, #1 + + add r11, r11, #0x40 ; round_shift_and_clamp + tst r2, #0xff ; test loop counter + usat r11, #8, r11, asr #7 + add r12, r12, #0x40 + strh r11, [lr], #20 ; result is transposed and stored, which + usat r12, #8, r12, asr #7 + + strh r12, [lr], #20 + + movne r11, r6 + movne r12, r7 + + movne r6, r8 + movne r7, r9 + movne r8, r10 + movne r9, r11 + movne r10, r12 + + bne first_pass_wloop_v6 + + ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines + ;;IF ARCHITECTURE=6 + ;pld [src, ppl] + ;;pld [src, r9] + ;;ENDIF + + subs r2, r2, #0x10000 + + sub lr, lr, #158 + + add r0, r0, r1 ; move to next input line + + add r11, r1, #18 ; preload next low. adding back block width(=8), which is subtracted earlier + pld [r0, r11] + + bne first_pass_hloop_v6 + +;second pass filter +secondpass_filter + ldr r3, [sp], #4 ; load back yoffset + ldr r0, [sp, #216] ; load dst address from stack 180+36 + ldr r1, [sp, #220] ; load dst stride from stack 180+40 + + cmp r3, #0 + beq skip_secondpass_filter + + adr r12, filter8_coeff + add lr, r12, r3, lsl #4 ;calculate filter location + + mov r2, #0x00080000 + + ldr r3, [lr] ; load up packed filter coefficients + ldr r4, [lr, #4] + ldr r5, [lr, #8] + + pkhbt r12, r4, r3 ; pack the filter differently + pkhbt r11, r5, r4 + +second_pass_hloop_v6 + ldr r6, [sp] ; load the data + ldr r7, [sp, #4] + + orr r2, r2, #2 ; loop counter + +second_pass_wloop_v6 + smuad lr, r3, r6 ; apply filter + smulbt r10, r3, r6 + + ldr r8, [sp, #8] + + smlad lr, r4, r7, lr + smladx r10, r12, r7, r10 + + ldrh r9, [sp, #12] + + smlad lr, r5, r8, lr + smladx r10, r11, r8, r10 + + add sp, sp, #4 + smlatb r10, r5, r9, r10 + + sub r2, r2, #1 + + add lr, lr, #0x40 ; round_shift_and_clamp + tst r2, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r0], r1 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + strb r10, [r0],r1 + + movne r6, r7 + movne r7, r8 + + bne second_pass_wloop_v6 + + subs r2, r2, #0x10000 + add sp, sp, #12 ; updata src for next loop (20-8) + sub r0, r0, r1, lsl #2 + add r0, r0, #1 + + bne second_pass_hloop_v6 + + add sp, sp, #20 + ldmia sp!, {r4 - r11, pc} + +;-------------------- +skip_firstpass_filter + sub r0, r0, r1, lsl #1 + sub r1, r1, #8 + mov r2, #9 + +skip_firstpass_hloop + ldrb r4, [r0], #1 ; load data + subs r2, r2, #1 + ldrb r5, [r0], #1 + strh r4, [lr], #20 ; store it to immediate buffer + ldrb r6, [r0], #1 ; load data + strh r5, [lr], #20 + ldrb r7, [r0], #1 + strh r6, [lr], #20 + ldrb r8, [r0], #1 + strh r7, [lr], #20 + ldrb r9, [r0], #1 + strh r8, [lr], #20 + ldrb r10, [r0], #1 + strh r9, [lr], #20 + ldrb r11, [r0], #1 + strh r10, [lr], #20 + add r0, r0, r1 ; move to next input line + strh r11, [lr], #20 + + sub lr, lr, #158 ; move over to next column + bne skip_firstpass_hloop + + b secondpass_filter + +;-------------------- +skip_secondpass_filter + mov r2, #8 + add sp, sp, #4 ;start from src[0] instead of src[-2] + +skip_secondpass_hloop + ldr r6, [sp], #4 + subs r2, r2, #1 + ldr r8, [sp], #4 + + mov r7, r6, lsr #16 ; unpack + strb r6, [r0], r1 + mov r9, r8, lsr #16 + strb r7, [r0], r1 + add sp, sp, #12 ; 20-8 + strb r8, [r0], r1 + strb r9, [r0], r1 + + sub r0, r0, r1, lsl #2 + add r0, r0, #1 + + bne skip_secondpass_hloop + + add sp, sp, #16 ; 180 - (160 +4) + + ldmia sp!, {r4 - r11, pc} + + ENDP + +;----------------- +;One word each is reserved. Label filter_coeff can be used to access the data. +;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... +filter8_coeff + DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 + DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 + DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 + DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 + DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 + DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 + DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 + DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 + + ;DCD 0, 0, 128, 0, 0, 0 + ;DCD 0, -6, 123, 12, -1, 0 + ;DCD 2, -11, 108, 36, -8, 1 + ;DCD 0, -9, 93, 50, -6, 0 + ;DCD 3, -16, 77, 77, -16, 3 + ;DCD 0, -6, 50, 93, -9, 0 + ;DCD 1, -8, 36, 108, -11, 2 + ;DCD 0, -1, 12, 123, -6, 0 + + END diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm new file mode 100644 index 000000000..3668dc517 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm @@ -0,0 +1,182 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_variance_halfpixvar16x16_h_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance_halfpixvar16x16_h_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #12] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END + diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm new file mode 100644 index 000000000..b4e0959d1 --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_variance_halfpixvar16x16_hv_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance_halfpixvar16x16_hv_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; pointer to pixels on the next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load source pixels a, row N + ldr r6, [r0, #1] ; load source pixels b, row N + ldr r5, [r9, #0] ; load source pixels c, row N+1 + ldr r7, [r9, #1] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #0] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load source pixels a, row N + ldr r6, [r0, #5] ; load source pixels b, row N + ldr r5, [r9, #4] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #5] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #4] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load source pixels a, row N + ldr r6, [r0, #9] ; load source pixels b, row N + ldr r5, [r9, #8] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #9] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #8] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load source pixels a, row N + ldr r6, [r0, #13] ; load source pixels b, row N + ldr r5, [r9, #12] ; load source pixels c, row N+1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + ldr r7, [r9, #13] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #12] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm new file mode 100644 index 000000000..10863e2ec --- /dev/null +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm @@ -0,0 +1,184 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_variance_halfpixvar16x16_v_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance_halfpixvar16x16_v_armv6| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; set src pointer to next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r9, #0] ; load 4 src pixels from next row + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r9, #4] ; load 4 src pixels from next row + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r9, #8] ; load 4 src pixels from next row + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r6, [r9, #12] ; load 4 src pixels from next row + ldr r5, [r2, #12] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END + diff --git a/media/libvpx/vp8/common/arm/bilinearfilter_arm.c b/media/libvpx/vp8/common/arm/bilinearfilter_arm.c new file mode 100644 index 000000000..799c8bd96 --- /dev/null +++ b/media/libvpx/vp8/common/arm/bilinearfilter_arm.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include <math.h> +#include "vp8/common/filter.h" +#include "bilinearfilter_arm.h" + +void vp8_filter_block2d_bil_armv6 +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + unsigned int src_pitch, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height +) +{ + unsigned short FData[36*16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); + + /* then 1-D vertically... */ + vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); +} + +void vp8_bilinear_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); +} + +void vp8_bilinear_predict8x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); +} + +void vp8_bilinear_predict16x16_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/media/libvpx/vp8/common/arm/bilinearfilter_arm.h b/media/libvpx/vp8/common/arm/bilinearfilter_arm.h new file mode 100644 index 000000000..6b84e6f3b --- /dev/null +++ b/media/libvpx/vp8/common/arm/bilinearfilter_arm.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ +#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_filter_block2d_bil_first_pass_armv6 +( + const unsigned char *src_ptr, + unsigned short *dst_ptr, + unsigned int src_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +); + +extern void vp8_filter_block2d_bil_second_pass_armv6 +( + const unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ diff --git a/media/libvpx/vp8/common/arm/dequantize_arm.c b/media/libvpx/vp8/common/arm/dequantize_arm.c new file mode 100644 index 000000000..1f8157f0b --- /dev/null +++ b/media/libvpx/vp8/common/arm/dequantize_arm.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8/common/blockd.h" + +#if HAVE_MEDIA +extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); + +void vp8_dequantize_b_v6(BLOCKD *d, short *DQC) +{ + short *DQ = d->dqcoeff; + short *Q = d->qcoeff; + + vp8_dequantize_b_loop_v6(Q, DQC, DQ); +} +#endif diff --git a/media/libvpx/vp8/common/arm/filter_arm.c b/media/libvpx/vp8/common/arm/filter_arm.c new file mode 100644 index 000000000..d6a6781d8 --- /dev/null +++ b/media/libvpx/vp8/common/arm/filter_arm.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include <math.h> +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" + +extern void vp8_filter_block2d_first_pass_armv6 +( + unsigned char *src_ptr, + short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_width, + unsigned int output_height, + const short *vp8_filter +); + +// 8x8 +extern void vp8_filter_block2d_first_pass_8x8_armv6 +( + unsigned char *src_ptr, + short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_width, + unsigned int output_height, + const short *vp8_filter +); + +// 16x16 +extern void vp8_filter_block2d_first_pass_16x16_armv6 +( + unsigned char *src_ptr, + short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_width, + unsigned int output_height, + const short *vp8_filter +); + +extern void vp8_filter_block2d_second_pass_armv6 +( + short *src_ptr, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int cnt, + const short *vp8_filter +); + +extern void vp8_filter4_block2d_second_pass_armv6 +( + short *src_ptr, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int cnt, + const short *vp8_filter +); + +extern void vp8_filter_block2d_first_pass_only_armv6 +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int cnt, + unsigned int output_pitch, + const short *vp8_filter +); + + +extern void vp8_filter_block2d_second_pass_only_armv6 +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int cnt, + unsigned int output_pitch, + const short *vp8_filter +); + +#if HAVE_MEDIA +void vp8_sixtap_predict4x4_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED(4, short, FData[12*4]); /* Temp data buffer used in filtering */ + + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* Vfilter is null. First pass only */ + if (xoffset && !yoffset) + { + /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter ); + vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/ + + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); + } + /* Hfilter is null. Second pass only */ + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); + } + else + { + /* Vfilter is a 4 tap filter */ + if (yoffset & 0x1) + { + vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter); + vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); + } + /* Vfilter is 6 tap filter */ + else + { + vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter); + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); + } + } +} + +void vp8_sixtap_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED(4, short, FData[16*8]); /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + if (xoffset && !yoffset) + { + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); + } + /* Hfilter is null. Second pass only */ + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); + } + else + { + if (yoffset & 0x1) + { + vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter); + vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); + } + else + { + vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter); + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); + } + } +} + + +void vp8_sixtap_predict16x16_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED(4, short, FData[24*16]); /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + if (xoffset && !yoffset) + { + vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); + } + /* Hfilter is null. Second pass only */ + else if (!xoffset && yoffset) + { + vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); + } + else + { + if (yoffset & 0x1) + { + vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); + vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); + } + else + { + vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); + vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); + } + } + +} +#endif diff --git a/media/libvpx/vp8/common/arm/loopfilter_arm.c b/media/libvpx/vp8/common/arm/loopfilter_arm.c new file mode 100644 index 000000000..5840c2bba --- /dev/null +++ b/media/libvpx/vp8/common/arm/loopfilter_arm.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) + +#if HAVE_MEDIA +extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); +#endif + +#if HAVE_NEON +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh, + unsigned char *v); + +extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; + +extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; +#endif + +#if HAVE_MEDIA +/* ARMV6/MEDIA loopfilter functions*/ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); +} +#endif + +#if HAVE_NEON +/* NEON loopfilter functions */ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); +} +#endif diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 000000000..9824a3193 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const uint8_t bifilter4_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +void vp8_bilinear_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8; + uint16x8_t q1u16, q2u16; + uint16x8_t q7u16, q8u16, q9u16; + uint64x2_t q4u64, q5u64; + uint64x1_t d12u64; + uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2; + + if (xoffset == 0) { // skip_1stpass_filter + uint32x2_t d28u32 = vdup_n_u32(0); + uint32x2_t d29u32 = vdup_n_u32(0); + uint32x2_t d30u32 = vdup_n_u32(0); + + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0); + src_ptr += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1); + src_ptr += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + } else { + d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d6u8 = vld1_u8(src_ptr); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); + q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); + d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), + vreinterpret_u32_u8(vget_high_u8(q1u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), + vreinterpret_u32_u8(vget_high_u8(q2u8))); + d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), + vreinterpret_u32_u64(vget_high_u64(q4u64))); + d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), + vreinterpret_u32_u64(vget_high_u64(q5u64))); + + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q9u16 = vmull_u8(d6u8, d0u8); + + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8); + q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8); + + d28u8 = vqrshrn_n_u16(q7u16, 7); + d29u8 = vqrshrn_n_u16(q8u16, 7); + d30u8 = vqrshrn_n_u16(q9u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d28u8, d0u8); + q2u16 = vmull_u8(d29u8, d0u8); + + d26u8 = vext_u8(d28u8, d29u8, 4); + d27u8 = vext_u8(d29u8, d30u8, 4); + + q1u16 = vmlal_u8(q1u16, d26u8, d1u8); + q2u16 = vmlal_u8(q2u16, d27u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + } + return; +} + +void vp8_bilinear_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8; + uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + d26u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + } + return; +} + +void vp8_bilinear_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d29u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 =vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); tmpp += 16; + q13u8 = vld1q_u8(tmpp); tmpp += 16; + q14u8 = vld1q_u8(tmpp); tmpp += 16; + q15u8 = vld1q_u8(tmpp); tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 000000000..deced115c --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_copy_mem8x4_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 4; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 8; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem16x16_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + int r; + uint8x16_t qtmp; + + for (r = 0; r < 16; r++) { + qtmp = vld1q_u8(src); + vst1q_u8(dst, qtmp); + src += src_stride; + dst += dst_stride; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 000000000..ad5f41d7d --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_dc_only_idct_add_neon( + int16_t input_dc, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c new file mode 100644 index 000000000..58e11922c --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; + +void vp8_dequant_idct_add_neon( + int16_t *input, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int32x2_t d14, d15; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + int16x8_t q1, q2, q3, q4, q5, q6; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x2x2_t d2tmp0, d2tmp1; + int16x4x2_t d2tmp2, d2tmp3; + + d14 = d15 = vdup_n_s32(0); + + // load input + q3 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + input += 8; + q4 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + + // load dq + q5 = vld1q_s16(dq); + dq += 8; + q6 = vld1q_s16(dq); + + // load src from dst + dst0 = dst; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); + dst0 += stride; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); + + q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3), + vreinterpretq_u16_s16(q5))); + q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4), + vreinterpretq_u16_s16(q6))); + + d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); + d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); + + q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + // loop 2 + q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); + d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); + q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); + + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), + vreinterpret_u8_s32(d14))); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), + vreinterpret_u8_s32(d15))); + + d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); + d15 = vreinterpret_s32_u8(vqmovun_s16(q2)); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d14, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d14, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 1); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c new file mode 100644 index 000000000..54e709dd3 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) { + int16x8x2_t qQ, qDQC, qDQ; + + qQ = vld2q_s16(d->qcoeff); + qDQC = vld2q_s16(DQC); + + qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]); + qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]); + + vst2q_s16(d->dqcoeff, qDQ); +} diff --git a/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c new file mode 100644 index 000000000..fb327a726 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +/* place these declarations here because we don't want to maintain them + * outside of this scope + */ +void idct_dequant_full_2x_neon(short *q, short *dq, + unsigned char *dst, int stride); +void idct_dequant_0_2x_neon(short *q, short dq, + unsigned char *dst, int stride); + + +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dst, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dst, stride); + } + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q+32, dq, dst+8, stride); + else + idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride); + } + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) +{ + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + dstu += 4*stride; + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + + if (((short *)(eobs))[2]) + { + if (((short *)eobs)[2] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } + + q += 32; + dstv += 4*stride; + + if (((short *)(eobs))[3]) + { + if (((short *)eobs)[3] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } +} diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c new file mode 100644 index 000000000..967c32280 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void idct_dequant_0_2x_neon( + int16_t *q, + int16_t dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32, d4s32; + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c new file mode 100644 index 000000000..a60ed46b7 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +void idct_dequant_full_2x_neon( + int16_t *q, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 = vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), + vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), + vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), + vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), + vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 000000000..6ea9dd712 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_short_inv_walsh4x4_neon( + int16_t *input, + int16_t *mb_dqcoeff) { + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x4_t d4s16, d5s16, d6s16, d7s16; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + int16x8_t qAdd3; + + q0s16 = vld1q_s16(input); + q1s16 = vld1q_s16(input + 8); + + // 1st for loop + d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)), + vreinterpret_s32_s16(vget_low_s16(q1s16))); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)), + vreinterpret_s32_s16(vget_high_s16(q1s16))); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), + vreinterpret_s16_s32(v2tmp3.val[0])); + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), + vreinterpret_s16_s32(v2tmp3.val[1])); + + // 2nd for loop + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + qAdd3 = vdupq_n_s16(3); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + q0s16 = vaddq_s16(q0s16, qAdd3); + q1s16 = vaddq_s16(q1s16, qAdd3); + + q0s16 = vshrq_n_s16(q0s16, 3); + q1s16 = vshrq_n_s16(q1s16, 3); + + // store + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3); + mb_dqcoeff += 16; + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c new file mode 100644 index 000000000..9d6807af7 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/arm.h" + +static INLINE void vp8_loop_filter_neon( + uint8x16_t qblimit, // flimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q10 = vdupq_n_u8(3); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vmovl_u8(vget_low_u8(q10)); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + q0u8 = vdupq_n_u8(0x80); + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8); + return; +} + +void vp8_loop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +static INLINE void write_4x8(unsigned char *dst, int pitch, + const uint8x8x4_t result) { +#ifdef VPX_INCOMPATIBLE_GCC + /* + * uint8x8x4_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + 20 21 22 23 | 24 25 26 27 + 30 31 32 33 | 34 35 36 37 + --- + * after vtrn_u16 + 00 01 20 21 | 04 05 24 25 + 02 03 22 23 | 06 07 26 27 + 10 11 30 31 | 14 15 34 35 + 12 13 32 33 | 16 17 36 37 + --- + * after vtrn_u8 + 00 10 20 30 | 04 14 24 34 + 01 11 21 31 | 05 15 25 35 + 02 12 22 32 | 06 16 26 36 + 03 13 23 33 | 07 17 27 37 + */ + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), + vreinterpret_u16_u8(result.val[2])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), + vreinterpret_u16_u8(result.val[3])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); + const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); + const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); + const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); + vst1_lane_u32((uint32_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 1); +#else + vst4_lane_u8(dst, result, 0); + dst += pitch; + vst4_lane_u8(dst, result, 1); + dst += pitch; + vst4_lane_u8(dst, result, 2); + dst += pitch; + vst4_lane_u8(dst, result, 3); + dst += pitch; + vst4_lane_u8(dst, result, 4); + dst += pitch; + vst4_lane_u8(dst, result, 5); + dst += pitch; + vst4_lane_u8(dst, result, 6); + dst += pitch; + vst4_lane_u8(dst, result, 7); +#endif // VPX_INCOMPATIBLE_GCC +} + +void vp8_loop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + write_4x8(d, pitch, q4ResultL); + d += pitch * 8; + write_4x8(d, pitch, q4ResultH); +} + +void vp8_loop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + write_4x8(ud, pitch, q4ResultL); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + write_4x8(vd, pitch, q4ResultH); +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c new file mode 100644 index 000000000..b25686ffb --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( + unsigned char *s, + int p, + const unsigned char *blimit) { + uint8_t *sp; + uint8x16_t qblimit, q0u8; + uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8; + int16x8_t q2s16, q3s16, q13s16; + int8x8_t d8s8, d9s8; + int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8; + + qblimit = vdupq_n_u8(*blimit); + + sp = s - (p << 1); + q5u8 = vld1q_u8(sp); + sp += p; + q6u8 = vld1q_u8(sp); + sp += p; + q7u8 = vld1q_u8(sp); + sp += p; + q8u8 = vld1q_u8(sp); + + q15u8 = vabdq_u8(q6u8, q7u8); + q14u8 = vabdq_u8(q5u8, q8u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q13s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + q7u8 = veorq_u8(q7u8, q0u8); + q8u8 = veorq_u8(q8u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)), + vget_low_s8(vreinterpretq_s8_u8(q6u8))); + q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)), + vget_high_s8(vreinterpretq_s8_u8(q6u8))); + + q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), + vreinterpretq_s8_u8(q8u8)); + + q2s16 = vmulq_s16(q2s16, q13s16); + q3s16 = vmulq_s16(q3s16, q13s16); + + q10u8 = vdupq_n_u8(3); + q9u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8)); + q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8)); + + d8s8 = vqmovn_s16(q2s16); + d9s8 = vqmovn_s16(q3s16); + q4s8 = vcombine_s8(d8s8, d9s8); + + q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q3s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + vst1q_u8(s, q7u8); + s -= p; + vst1q_u8(s, q6u8); + return; +} + +void vp8_loop_filter_bhs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbhs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c new file mode 100644 index 000000000..e1c8609e3 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/arm.h" + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE void write_2x4(unsigned char *dst, int pitch, + const uint8x8x2_t result) { + /* + * uint8x8x2_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + --- + * after vtrn_u8 + 00 10 02 12 | 04 14 06 16 + 01 11 03 13 | 05 15 07 17 + */ + const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0], + result.val[1]); + const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]); + const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]); + vst1_lane_u16((uint16_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 3); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 3); +} + +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + write_2x4(dst, pitch, result); + dst += pitch * 8; + write_2x4(dst, pitch, result2); +} +#else +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + vst2_lane_u8(dst, result, 0); + dst += pitch; + vst2_lane_u8(dst, result, 1); + dst += pitch; + vst2_lane_u8(dst, result, 2); + dst += pitch; + vst2_lane_u8(dst, result, 3); + dst += pitch; + vst2_lane_u8(dst, result, 4); + dst += pitch; + vst2_lane_u8(dst, result, 5); + dst += pitch; + vst2_lane_u8(dst, result, 6); + dst += pitch; + vst2_lane_u8(dst, result, 7); + dst += pitch; + + vst2_lane_u8(dst, result2, 0); + dst += pitch; + vst2_lane_u8(dst, result2, 1); + dst += pitch; + vst2_lane_u8(dst, result2, 2); + dst += pitch; + vst2_lane_u8(dst, result2, 3); + dst += pitch; + vst2_lane_u8(dst, result2, 4); + dst += pitch; + vst2_lane_u8(dst, result2, 5); + dst += pitch; + vst2_lane_u8(dst, result2, 6); + dst += pitch; + vst2_lane_u8(dst, result2, 7); +} +#endif // VPX_INCOMPATIBLE_GCC + + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE +uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) { + const uint8x8_t a = vld1_u8(src); + const uint8x8_t b = vld1_u8(src + pitch * 1); + const uint8x8_t c = vld1_u8(src + pitch * 2); + const uint8x8_t d = vld1_u8(src + pitch * 3); + const uint8x8_t e = vld1_u8(src + pitch * 4); + const uint8x8_t f = vld1_u8(src + pitch * 5); + const uint8x8_t g = vld1_u8(src + pitch * 6); + const uint8x8_t h = vld1_u8(src + pitch * 7); + const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a), + vreinterpret_u32_u8(e)); + const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b), + vreinterpret_u32_u8(f)); + const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c), + vreinterpret_u32_u8(g)); + const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d), + vreinterpret_u32_u8(h)); + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]), + vreinterpret_u16_u32(r26_u32.val[0])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]), + vreinterpret_u16_u32(r37_u32.val[0])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + /* + * after vtrn_u32 + 00 01 02 03 | 40 41 42 43 + 10 11 12 13 | 50 51 52 53 + 20 21 22 23 | 60 61 62 63 + 30 31 32 33 | 70 71 72 73 + --- + * after vtrn_u16 + 00 01 20 21 | 40 41 60 61 + 02 03 22 23 | 42 43 62 63 + 10 11 30 31 | 50 51 70 71 + 12 13 32 33 | 52 52 72 73 + + 00 01 20 21 | 40 41 60 61 + 10 11 30 31 | 50 51 70 71 + 02 03 22 23 | 42 43 62 63 + 12 13 32 33 | 52 52 72 73 + --- + * after vtrn_u8 + 00 10 20 30 | 40 50 60 70 + 01 11 21 31 | 41 51 61 71 + 02 12 22 32 | 42 52 62 72 + 03 13 23 33 | 43 53 63 73 + */ + x.val[0] = r01_u8.val[0]; + x.val[1] = r01_u8.val[1]; + x.val[2] = r23_u8.val[0]; + x.val[3] = r23_u8.val[1]; + + return x; +} +#else +static INLINE +uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) { + x = vld4_lane_u8(src, x, 0); + src += pitch; + x = vld4_lane_u8(src, x, 1); + src += pitch; + x = vld4_lane_u8(src, x, 2); + src += pitch; + x = vld4_lane_u8(src, x, 3); + src += pitch; + x = vld4_lane_u8(src, x, 4); + src += pitch; + x = vld4_lane_u8(src, x, 5); + src += pitch; + x = vld4_lane_u8(src, x, 6); + src += pitch; + x = vld4_lane_u8(src, x, 7); + return x; +} +#endif // VPX_INCOMPATIBLE_GCC + +static INLINE void vp8_loop_filter_simple_vertical_edge_neon( + unsigned char *s, + int p, + const unsigned char *blimit) { + unsigned char *src1; + uint8x16_t qblimit, q0u8; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; + int16x8_t q2s16, q13s16, q11s16; + int8x8_t d28s8, d29s8; + int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; + uint8x8x4_t d0u8x4; // d6, d7, d8, d9 + uint8x8x4_t d1u8x4; // d10, d11, d12, d13 + uint8x8x2_t d2u8x2; // d12, d13 + uint8x8x2_t d3u8x2; // d14, d15 + + qblimit = vdupq_n_u8(*blimit); + + src1 = s - 2; + d0u8x4 = read_4x8(src1, p, d0u8x4); + src1 += p * 8; + d1u8x4 = read_4x8(src1, p, d1u8x4); + + q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 + q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12 + q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 + q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 + + q15u8 = vabdq_u8(q5u8, q4u8); + q14u8 = vabdq_u8(q3u8, q6u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q11s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q3u8 = veorq_u8(q3u8, q0u8); + q4u8 = veorq_u8(q4u8, q0u8); + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), + vget_low_s8(vreinterpretq_s8_u8(q5u8))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), + vget_high_s8(vreinterpretq_s8_u8(q5u8))); + + q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), + vreinterpretq_s8_u8(q6u8)); + + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q11u8 = vdupq_n_u8(3); + q12u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); + + d28s8 = vqmovn_s16(q2s16); + d29s8 = vqmovn_s16(q13s16); + q14s8 = vcombine_s8(d28s8, d29s8); + + q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q14s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + d2u8x2.val[0] = vget_low_u8(q6u8); // d12 + d2u8x2.val[1] = vget_low_u8(q7u8); // d14 + d3u8x2.val[0] = vget_high_u8(q6u8); // d13 + d3u8x2.val[1] = vget_high_u8(q7u8); // d15 + + src1 = s - 1; + write_2x8(src1, p, d2u8x2, d3u8x2); +} + +void vp8_loop_filter_bvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c new file mode 100644 index 000000000..5351f4be6 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_mbloop_filter_neon( + uint8x16_t qblimit, // mblimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p2 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q4r, // p1 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r, // q1 + uint8x16_t *q9r) { // q1 + uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8; + uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16; + int8x16_t q0s8, q12s8, q14s8, q15s8; + int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q1u8 = vabdq_u8(q9, q8); + q0u8 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q1u8 = vmaxq_u8(q1u8, q0u8); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q12u8 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q1u8); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + q1u8 = vabdq_u8(q5, q8); + q12u8 = vqaddq_u8(q12u8, q12u8); + + // vp8_filter() function + // convert to signed + q0u8 = vdupq_n_u8(0x80); + q9 = veorq_u8(q9, q0u8); + q8 = veorq_u8(q8, q0u8); + q7 = veorq_u8(q7, q0u8); + q6 = veorq_u8(q6, q0u8); + q5 = veorq_u8(q5, q0u8); + q4 = veorq_u8(q4, q0u8); + + q1u8 = vshrq_n_u8(q1u8, 1); + q12u8 = vqaddq_u8(q12u8, q1u8); + + q14u8 = vorrq_u8(q13u8, q14u8); + q12u8 = vcgeq_u8(qblimit, q12u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q11s16 = vdupq_n_s16(3); + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q15u8 = vandq_u8(q15u8, q12u8); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8)); + + q12u8 = vdupq_n_u8(3); + q11u8 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2 = vqmovn_s16(q2s16); + d3 = vqmovn_s16(q13s16); + q1s8 = vcombine_s8(d2, d3); + q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8)); + q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8)); + q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q13s8 = vshrq_n_s8(q13s8, 3); + + q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8); + q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8); + + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63); + d5 = vdup_n_s8(9); + d4 = vdup_n_s8(18); + + q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5); + q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5); + d5 = vdup_n_s8(27); + q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4); + q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4); + q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5); + q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5); + + d0 = vqshrn_n_s16(q0s16 , 7); + d1 = vqshrn_n_s16(q11s16, 7); + d24 = vqshrn_n_s16(q12s16, 7); + d25 = vqshrn_n_s16(q13s16, 7); + d28 = vqshrn_n_s16(q14s16, 7); + d29 = vqshrn_n_s16(q15s16, 7); + + q0s8 = vcombine_s8(d0, d1); + q12s8 = vcombine_s8(d24, d25); + q14s8 = vcombine_s8(d28, d29); + + q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8); + q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8); + q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8); + q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8); + q15s8 = vqsubq_s8((q7s8), q14s8); + q14s8 = vqaddq_s8((q6s8), q14s8); + + q1u8 = vdupq_n_u8(0x80); + *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8); + *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8); + *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8); + return; +} + +void vp8_mbloop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + src -= (pitch * 6); + vst1q_u8(src, q4); + src += pitch; + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + src += pitch; + vst1q_u8(src, q9); + return; +} + +void vp8_mbloop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + u -= (pitch * 6); + v -= (pitch * 6); + vst1_u8(u, vget_low_u8(q4)); + u += pitch; + vst1_u8(v, vget_high_u8(q4)); + v += pitch; + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(u, vget_low_u8(q8)); + u += pitch; + vst1_u8(v, vget_high_u8(q8)); + v += pitch; + vst1_u8(u, vget_low_u8(q9)); + vst1_u8(v, vget_high_u8(q9)); + return; +} + +void vp8_mbloop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s1, *s2; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s1 = src - 4; + s2 = s1 + 8 * pitch; + d6 = vld1_u8(s1); + s1 += pitch; + d7 = vld1_u8(s2); + s2 += pitch; + d8 = vld1_u8(s1); + s1 += pitch; + d9 = vld1_u8(s2); + s2 += pitch; + d10 = vld1_u8(s1); + s1 += pitch; + d11 = vld1_u8(s2); + s2 += pitch; + d12 = vld1_u8(s1); + s1 += pitch; + d13 = vld1_u8(s2); + s2 += pitch; + d14 = vld1_u8(s1); + s1 += pitch; + d15 = vld1_u8(s2); + s2 += pitch; + d16 = vld1_u8(s1); + s1 += pitch; + d17 = vld1_u8(s2); + s2 += pitch; + d18 = vld1_u8(s1); + s1 += pitch; + d19 = vld1_u8(s2); + s2 += pitch; + d20 = vld1_u8(s1); + d21 = vld1_u8(s2); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + s1 -= 7 * pitch; + s2 -= 7 * pitch; + + vst1_u8(s1, vget_low_u8(q3)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q3)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q4)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q4)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q5)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q5)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q6)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q6)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q7)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q7)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q8)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q8)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q9)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q9)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q10)); + vst1_u8(s2, vget_high_u8(q10)); + return; +} + +void vp8_mbloop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + vs = v - 4; + d6 = vld1_u8(us); + us += pitch; + d7 = vld1_u8(vs); + vs += pitch; + d8 = vld1_u8(us); + us += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d10 = vld1_u8(us); + us += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d12 = vld1_u8(us); + us += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d14 = vld1_u8(us); + us += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d16 = vld1_u8(us); + us += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d18 = vld1_u8(us); + us += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d20 = vld1_u8(us); + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + ud = u - 4; + vst1_u8(ud, vget_low_u8(q3)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q4)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q5)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q6)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q7)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q8)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q9)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q10)); + + vd = v - 4; + vst1_u8(vd, vget_high_u8(q3)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q4)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q5)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q6)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q7)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q8)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q9)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q10)); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/reconintra_neon.c b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c new file mode 100644 index 000000000..af52cd5ea --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) { + const int mode = x->mode_info_context->mbmi.mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x16_t v_expected_dc = vdupq_n_u8(128); + + if (shift) { + unsigned int average = 0; + int expected_dc; + if (x->up_available) { + const uint8x16_t v_above = vld1q_u8(yabove_row); + const uint16x8_t a = vpaddlq_u8(v_above); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + average = vget_lane_u32(d, 0); + } + if (x->left_available) { + for (i = 0; i < 16; ++i) { + average += yleft[0]; + yleft += left_stride; + } + } + shift += 3; + expected_dc = (average + (1 << (shift - 1))) >> shift; + v_expected_dc = vmovq_n_u8((uint8_t)expected_dc); + } + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_expected_dc); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: + { + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_above); + ypred_ptr += y_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 16; ++i) { + const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]); + yleft += left_stride; + vst1q_u8(ypred_ptr, v_yleft); + ypred_ptr += y_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]); + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]); + const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft); + const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft); + const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo), + vreinterpretq_s16_u16(v_ytop_left)); + const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi), + vreinterpretq_s16_u16(v_ytop_left)); + const uint8x8_t pred_lo = vqmovun_s16(b_lo); + const uint8x8_t pred_hi = vqmovun_s16(b_hi); + + vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi)); + ypred_ptr += y_stride; + yleft += left_stride; + } + } + break; + } +} + +void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) { + const int mode = x->mode_info_context->mbmi.uv_mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x8_t v_expected_udc = vdup_n_u8(128); + uint8x8_t v_expected_vdc = vdup_n_u8(128); + + if (shift) { + unsigned int average_u = 0; + unsigned int average_v = 0; + int expected_udc; + int expected_vdc; + if (x->up_available) { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove)); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0); + average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2); + } + if (x->left_available) { + for (i = 0; i < 8; ++i) { + average_u += uleft[0]; + uleft += left_stride; + average_v += vleft[0]; + vleft += left_stride; + } + } + shift += 2; + expected_udc = (average_u + (1 << (shift - 1))) >> shift; + expected_vdc = (average_v + (1 << (shift - 1))) >> shift; + v_expected_udc = vmov_n_u8((uint8_t)expected_udc); + v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc); + } + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_expected_udc); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_expected_vdc); + vpred_ptr += pred_stride; + } + } + break; + case V_PRED: + { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_uabove); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vabove); + vpred_ptr += pred_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]); + uleft += left_stride; + vleft += left_stride; + vst1_u8(upred_ptr, v_uleft); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vleft); + vpred_ptr += pred_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]); + const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]); + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]); + const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft); + const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft); + const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u), + vreinterpretq_s16_u16(v_utop_left)); + const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v), + vreinterpretq_s16_u16(v_vtop_left)); + const uint8x8_t pred_u = vqmovun_s16(b_u); + const uint8x8_t pred_v = vqmovun_s16(b_v); + + vst1_u8(upred_ptr, pred_u); + vst1_u8(vpred_ptr, pred_v); + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + uleft += left_stride; + vleft += left_stride; + } + } + break; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c new file mode 100644 index 000000000..373afa6ed --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; + +void vp8_short_idct4x4llm_neon( + int16_t *input, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint32x2_t d6u32 = vdup_n_u32(0); + uint8x8_t d1u8; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + uint16x8_t q1u16; + int16x8_t q1s16, q2s16, q3s16, q4s16; + int32x2x2_t v2tmp0, v2tmp1; + int16x4x2_t v2tmp2, v2tmp3; + + d2 = vld1_s16(input); + d3 = vld1_s16(input + 4); + d4 = vld1_s16(input + 8); + d5 = vld1_s16(input + 12); + + // 1st for loop + q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here + q2s16 = vcombine_s16(d3, d5); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q3s16 = vshrq_n_s16(q3s16, 1); + q4s16 = vshrq_n_s16(q4s16, 1); + + q3s16 = vqaddq_s16(q3s16, q2s16); + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + // 2nd for loop + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); + q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q3s16 = vshrq_n_s16(q3s16, 1); + q4s16 = vshrq_n_s16(q4s16, 1); + + q3s16 = vqaddq_s16(q3s16, q2s16); + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); + q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); + + // dc_only_idct_add + for (i = 0; i < 2; i++, q1s16 = q2s16) { + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); + pred_ptr += pred_stride; + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), + vreinterpret_u8_u32(d6u32)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); + dst_ptr += dst_stride; + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c new file mode 100644 index 000000000..4c2efc92b --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -0,0 +1,1754 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vpx_ports/mem.h" + +static const int8_t vp8_sub_pel_filters[8][8] = { + {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positionyys are */ + {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */ + {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -9, 93, 50, -6, 0, 0, 0}, + {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ + {0, -6, 50, 93, -9, 0, 0, 0}, + {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -1, 12, 123, -6, 0, 0, 0}, +}; + +void vp8_sixtap_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8; + uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8; + uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64; + uint32x2x2_t d0u32x2, d1u32x2; + + if (xoffset == 0) { // secondpass_filter4x4_only + uint32x2_t d27u32 = vdup_n_u32(0); + uint32x2_t d28u32 = vdup_n_u32(0); + uint32x2_t d29u32 = vdup_n_u32(0); + uint32x2_t d30u32 = vdup_n_u32(0); + uint32x2_t d31u32 = vdup_n_u32(0); + + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0); + src += src_pixels_per_line; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1); + src += src_pixels_per_line; + d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0); + + d27u8 = vreinterpret_u8_u32(d27u32); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + d31u8 = vreinterpret_u8_u32(d31u32); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); + + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); + + // keep original src data in q4 q6 + q4u64 = vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + + d27u8 = vqrshrun_n_s16(q7s16, 7); + d28u8 = vqrshrun_n_s16(q8s16, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + return; + } + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q11u8 = vld1q_u8(src); + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); + q12u16 = vmull_u8(d31u8, d5u8); + + q4u64 = vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + q11u16 = vmull_u8(d31u8, d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + q12s16 = vqaddq_s16(q12s16, q11s16); + + d29u8 = vqrshrun_n_s16(q7s16, 7); + d30u8 = vqrshrun_n_s16(q8s16, 7); + d31u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 4x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); + + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; +} + +void vp8_sixtap_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; + + if (xoffset == 0) { // secondpass_filter8x4_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + return; + } + + // First Pass on rest 5-line data + src += src_pixels_per_line; + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; +} + +void vp8_sixtap_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *tmpp; + unsigned char tmp[64]; + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + tmpp = tmp; + for (i = 2; i > 0; i--) { + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + dst_ptr += dst_pitch; + } else { + vst1_u8(tmpp, d22u8); + tmpp += 8; + vst1_u8(tmpp, d23u8); + tmpp += 8; + vst1_u8(tmpp, d24u8); + tmpp += 8; + vst1_u8(tmpp, d25u8); + tmpp += 8; + } + } + if (yoffset == 0) + return; + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x8 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + tmpp = tmp; + q9u8 = vld1q_u8(tmpp); + tmpp += 16; + q10u8 = vld1q_u8(tmpp); + tmpp += 16; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + q12u8 = vld1q_u8(tmpp); + + d18u8 = vget_low_u8(q9u8); + d19u8 = vget_high_u8(q9u8); + d20u8 = vget_low_u8(q10u8); + d21u8 = vget_high_u8(q10u8); + d22u8 = vget_low_u8(q11u8); + d23u8 = vget_high_u8(q11u8); + d24u8 = vget_low_u8(q12u8); + d25u8 = vget_high_u8(q12u8); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; +} + +void vp8_sixtap_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *src_tmp, *dst, *tmpp; + unsigned char tmp[336]; + int i, j; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; + uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; + uint8x8_t d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint8x16_t q3u8, q4u8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; + uint16x8_t q11u16, q12u16, q13u16, q15u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; + int16x8_t q11s16, q12s16, q13s16, q15s16; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src_tmp = src_ptr - src_pixels_per_line * 2; + for (i = 0; i < 2; i++) { + src = src_tmp + i * 8; + dst = dst_ptr + i * 8; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) { // firstpass_filter4x4_only + src = src_ptr - 2; + dst = dst_ptr; + for (i = 0; i < 8; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + + q6u16 = vmull_u8(d6u8, d0u8); + q7u16 = vmull_u8(d7u8, d0u8); + q8u16 = vmull_u8(d9u8, d0u8); + q9u16 = vmull_u8(d10u8, d0u8); + + d20u8 = vext_u8(d6u8, d7u8, 1); + d21u8 = vext_u8(d9u8, d10u8, 1); + d22u8 = vext_u8(d7u8, d8u8, 1); + d23u8 = vext_u8(d10u8, d11u8, 1); + d24u8 = vext_u8(d6u8, d7u8, 4); + d25u8 = vext_u8(d9u8, d10u8, 4); + d26u8 = vext_u8(d7u8, d8u8, 4); + d27u8 = vext_u8(d10u8, d11u8, 4); + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + + q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); + q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); + q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); + q6u16 = vmlal_u8(q6u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + + d20u8 = vext_u8(d7u8, d8u8, 5); + d21u8 = vext_u8(d10u8, d11u8, 5); + d22u8 = vext_u8(d6u8, d7u8, 2); + d23u8 = vext_u8(d9u8, d10u8, 2); + d24u8 = vext_u8(d7u8, d8u8, 2); + d25u8 = vext_u8(d10u8, d11u8, 2); + d26u8 = vext_u8(d6u8, d7u8, 3); + d27u8 = vext_u8(d9u8, d10u8, 3); + d28u8 = vext_u8(d7u8, d8u8, 3); + d29u8 = vext_u8(d10u8, d11u8, 3); + + q7u16 = vmlal_u8(q7u16, d20u8, d5u8); + q9u16 = vmlal_u8(q9u16, d21u8, d5u8); + q6u16 = vmlal_u8(q6u16, d22u8, d2u8); + q8u16 = vmlal_u8(q8u16, d23u8, d2u8); + q7u16 = vmlal_u8(q7u16, d24u8, d2u8); + q9u16 = vmlal_u8(q9u16, d25u8, d2u8); + + q10u16 = vmull_u8(d26u8, d3u8); + q11u16 = vmull_u8(d27u8, d3u8); + q12u16 = vmull_u8(d28u8, d3u8); + q15u16 = vmull_u8(d29u8, d3u8); + + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q15s16 = vreinterpretq_s16_u16(q15u16); + + q6s16 = vqaddq_s16(q6s16, q10s16); + q8s16 = vqaddq_s16(q8s16, q11s16); + q7s16 = vqaddq_s16(q7s16, q12s16); + q9s16 = vqaddq_s16(q9s16, q15s16); + + d6u8 = vqrshrun_n_s16(q6s16, 7); + d7u8 = vqrshrun_n_s16(q7s16, 7); + d8u8 = vqrshrun_n_s16(q8s16, 7); + d9u8 = vqrshrun_n_s16(q9s16, 7); + + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + vst1q_u8(dst, q3u8); + dst += dst_pitch; + vst1q_u8(dst, q4u8); + dst += dst_pitch; + } + return; + } + + src = src_ptr - 2 - src_pixels_per_line * 2; + tmpp = tmp; + for (i = 0; i < 7; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d12u8 = vld1_u8(src); + d13u8 = vld1_u8(src + 8); + d14u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q8u16 = vmull_u8(d6u8, d0u8); + q9u16 = vmull_u8(d7u8, d0u8); + q10u16 = vmull_u8(d9u8, d0u8); + q11u16 = vmull_u8(d10u8, d0u8); + q12u16 = vmull_u8(d12u8, d0u8); + q13u16 = vmull_u8(d13u8, d0u8); + + d28u8 = vext_u8(d6u8, d7u8, 1); + d29u8 = vext_u8(d9u8, d10u8, 1); + d30u8 = vext_u8(d12u8, d13u8, 1); + q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); + d28u8 = vext_u8(d7u8, d8u8, 1); + d29u8 = vext_u8(d10u8, d11u8, 1); + d30u8 = vext_u8(d13u8, d14u8, 1); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); + + d28u8 = vext_u8(d6u8, d7u8, 4); + d29u8 = vext_u8(d9u8, d10u8, 4); + d30u8 = vext_u8(d12u8, d13u8, 4); + q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); + d28u8 = vext_u8(d7u8, d8u8, 4); + d29u8 = vext_u8(d10u8, d11u8, 4); + d30u8 = vext_u8(d13u8, d14u8, 4); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); + + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + d30u8 = vext_u8(d12u8, d13u8, 5); + q8u16 = vmlal_u8(q8u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q12u16 = vmlal_u8(q12u16, d30u8, d5u8); + d28u8 = vext_u8(d7u8, d8u8, 5); + d29u8 = vext_u8(d10u8, d11u8, 5); + d30u8 = vext_u8(d13u8, d14u8, 5); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q11u16 = vmlal_u8(q11u16, d29u8, d5u8); + q13u16 = vmlal_u8(q13u16, d30u8, d5u8); + + d28u8 = vext_u8(d6u8, d7u8, 2); + d29u8 = vext_u8(d9u8, d10u8, 2); + d30u8 = vext_u8(d12u8, d13u8, 2); + q8u16 = vmlal_u8(q8u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q12u16 = vmlal_u8(q12u16, d30u8, d2u8); + d28u8 = vext_u8(d7u8, d8u8, 2); + d29u8 = vext_u8(d10u8, d11u8, 2); + d30u8 = vext_u8(d13u8, d14u8, 2); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q11u16 = vmlal_u8(q11u16, d29u8, d2u8); + q13u16 = vmlal_u8(q13u16, d30u8, d2u8); + + d28u8 = vext_u8(d6u8, d7u8, 3); + d29u8 = vext_u8(d9u8, d10u8, 3); + d30u8 = vext_u8(d12u8, d13u8, 3); + d15u8 = vext_u8(d7u8, d8u8, 3); + d31u8 = vext_u8(d10u8, d11u8, 3); + d6u8 = vext_u8(d13u8, d14u8, 3); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q12s16 = vqaddq_s16(q12s16, q6s16); + + q6u16 = vmull_u8(d15u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + q3u16 = vmull_u8(d6u8, d3u8); + q3s16 = vreinterpretq_s16_u16(q3u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q13s16 = vreinterpretq_s16_u16(q13u16); + q9s16 = vqaddq_s16(q9s16, q6s16); + q11s16 = vqaddq_s16(q11s16, q7s16); + q13s16 = vqaddq_s16(q13s16, q3s16); + + d6u8 = vqrshrun_n_s16(q8s16, 7); + d7u8 = vqrshrun_n_s16(q9s16, 7); + d8u8 = vqrshrun_n_s16(q10s16, 7); + d9u8 = vqrshrun_n_s16(q11s16, 7); + d10u8 = vqrshrun_n_s16(q12s16, 7); + d11u8 = vqrshrun_n_s16(q13s16, 7); + + vst1_u8(tmpp, d6u8); + tmpp += 8; + vst1_u8(tmpp, d7u8); + tmpp += 8; + vst1_u8(tmpp, d8u8); + tmpp += 8; + vst1_u8(tmpp, d9u8); + tmpp += 8; + vst1_u8(tmpp, d10u8); + tmpp += 8; + vst1_u8(tmpp, d11u8); + tmpp += 8; + } + + // Second pass: 16x16 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + for (i = 0; i < 2; i++) { + dst = dst_ptr + 8 * i; + tmpp = tmp + 8 * i; + d18u8 = vld1_u8(tmpp); + tmpp += 16; + d19u8 = vld1_u8(tmpp); + tmpp += 16; + d20u8 = vld1_u8(tmpp); + tmpp += 16; + d21u8 = vld1_u8(tmpp); + tmpp += 16; + d22u8 = vld1_u8(tmpp); + tmpp += 16; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(tmpp); + tmpp += 16; + d24u8 = vld1_u8(tmpp); + tmpp += 16; + d25u8 = vld1_u8(tmpp); + tmpp += 16; + d26u8 = vld1_u8(tmpp); + tmpp += 16; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c new file mode 100644 index 000000000..974d3b653 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c @@ -0,0 +1,1017 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +static const uint16_t bilinear_taps_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +unsigned int vp8_sub_pixel_variance16x16_neon_func( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) { + int i; + DECLARE_ALIGNED(16, unsigned char, tmp[528]); + unsigned char *tmpp; + unsigned char *tmpp2; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; + uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + tmpp2 = tmp + 272; + tmpp = tmp; + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } else if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp2, q7u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q8u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q9u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q10u8); + tmpp2 += 16; + } + } else { + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); + tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); + + tmpp = tmp; + tmpp2 = tmpp + 272; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); + tmpp += 16; + q13u8 = vld1q_u8(tmpp); + tmpp += 16; + q14u8 = vld1q_u8(tmpp); + tmpp += 16; + q15u8 = vld1q_u8(tmpp); + tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } + + // sub_pixel_variance16x16_neon + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + tmpp = tmp + 272; + for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop + q0u8 = vld1q_u8(tmpp); + tmpp += 16; + q1u8 = vld1q_u8(tmpp); + tmpp += 16; + q2u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + q3u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); + q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_h_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8; + uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q0u8 = vld1q_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q2u8 = vld1q_u8(src_ptr); + q3u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + q5u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + q7u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + + q11u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q12u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q13u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q14u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q1u8 = vextq_u8(q0u8, q1u8, 1); + q3u8 = vextq_u8(q2u8, q3u8, 1); + q5u8 = vextq_u8(q4u8, q5u8, 1); + q7u8 = vextq_u8(q6u8, q7u8, 1); + + q0u8 = vrhaddq_u8(q0u8, q1u8); + q1u8 = vrhaddq_u8(q2u8, q3u8); + q2u8 = vrhaddq_u8(q4u8, q5u8); + q3u8 = vrhaddq_u8(q6u8, q7u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d6u8 = vget_low_u8(q3u8); + d7u8 = vget_high_u8(q3u8); + + q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8)); + q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8)); + q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8)); + q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8)); + q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8)); + q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8)); + q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8)); + q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8)); + + d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16)); + d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16)); + q9s32 = vmlal_s16(q9s32, d8s16, d8s16); + q10s32 = vmlal_s16(q10s32, d9s16, d9s16); + d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); + d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16)); + q9s32 = vmlal_s16(q9s32, d10s16, d10s16); + q10s32 = vmlal_s16(q10s32, d11s16, d11s16); + d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); + d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16)); + q9s32 = vmlal_s16(q9s32, d12s16, d12s16); + q10s32 = vmlal_s16(q10s32, d13s16, d13s16); + d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16)); + d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16)); + q9s32 = vmlal_s16(q9s32, d14s16, d14s16); + q10s32 = vmlal_s16(q10s32, d15s16, d15s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_v_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q2u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q15u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + + q1u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q5u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q7u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q0u8 = vrhaddq_u8(q0u8, q2u8); + q2u8 = vrhaddq_u8(q2u8, q4u8); + q4u8 = vrhaddq_u8(q4u8, q6u8); + q6u8 = vrhaddq_u8(q6u8, q15u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d8u8 = vget_low_u8(q4u8); + d9u8 = vget_high_u8(q4u8); + d12u8 = vget_low_u8(q6u8); + d13u8 = vget_high_u8(q6u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8)); + q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8)); + q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8)); + q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8)); + q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8)); + q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + + q0u8 = q15u8; + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_hv_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16; + int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; + uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16; + int32x4_t q13s32, q14s32, q15s32; + int64x2_t q0s64, q1s64, q5s64; + + q13s32 = vdupq_n_s32(0); + q14s32 = vdupq_n_s32(0); + q15s32 = vdupq_n_s32(0); + + q0u8 = vld1q_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q1u8 = vextq_u8(q0u8, q1u8, 1); + q0u8 = vrhaddq_u8(q0u8, q1u8); + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q2u8 = vld1q_u8(src_ptr); + q3u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + q5u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + q7u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q8u8 = vld1q_u8(src_ptr); + q9u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + + q3u8 = vextq_u8(q2u8, q3u8, 1); + q5u8 = vextq_u8(q4u8, q5u8, 1); + q7u8 = vextq_u8(q6u8, q7u8, 1); + q9u8 = vextq_u8(q8u8, q9u8, 1); + + q1u8 = vrhaddq_u8(q2u8, q3u8); + q2u8 = vrhaddq_u8(q4u8, q5u8); + q3u8 = vrhaddq_u8(q6u8, q7u8); + q4u8 = vrhaddq_u8(q8u8, q9u8); + q0u8 = vrhaddq_u8(q0u8, q1u8); + q1u8 = vrhaddq_u8(q1u8, q2u8); + q2u8 = vrhaddq_u8(q2u8, q3u8); + q3u8 = vrhaddq_u8(q3u8, q4u8); + + q5u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q6u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q7u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q8u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d6u8 = vget_low_u8(q3u8); + d7u8 = vget_high_u8(q3u8); + + q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8)); + q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8)); + q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8)); + q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8)); + q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8)); + q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8)); + q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8)); + q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16)); + q14s32 = vmlal_s16(q14s32, d18s16, d18s16); + q15s32 = vmlal_s16(q15s32, d19s16, d19s16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16)); + q14s32 = vmlal_s16(q14s32, d20s16, d20s16); + q15s32 = vmlal_s16(q15s32, d21s16, d21s16); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16)); + q14s32 = vmlal_s16(q14s32, d22s16, d22s16); + q15s32 = vmlal_s16(q15s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16)); + q14s32 = vmlal_s16(q14s32, d24s16, d24s16); + q15s32 = vmlal_s16(q15s32, d25s16, d25s16); + + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16)); + q14s32 = vmlal_s16(q14s32, d0s16, d0s16); + q15s32 = vmlal_s16(q15s32, d1s16, d1s16); + + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16)); + q14s32 = vmlal_s16(q14s32, d2s16, d2s16); + q15s32 = vmlal_s16(q15s32, d3s16, d3s16); + + d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); + d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16)); + q14s32 = vmlal_s16(q14s32, d10s16, d10s16); + q15s32 = vmlal_s16(q15s32, d11s16, d11s16); + + d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); + d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16)); + q14s32 = vmlal_s16(q14s32, d12s16, d12s16); + q15s32 = vmlal_s16(q15s32, d13s16, d13s16); + + q0u8 = q4u8; + } + + q15s32 = vaddq_s32(q14s32, q15s32); + q0s64 = vpaddlq_s32(q13s32); + q1s64 = vpaddlq_s32(q15s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +#define FILTER_BITS 7 + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (8 * 8)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint16_t *vpx_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp8_sub_pixel_variance8x8_neon( + const unsigned char *src, + int src_stride, + int xoffset, + int yoffset, + const unsigned char *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]); + DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); + if (xoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8, + 8, bilinear_taps_coeff[yoffset]); + } else if (yoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, 1, + 9, 8, + bilinear_taps_coeff[xoffset]); + } else { + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, + 9, 8, + bilinear_taps_coeff[xoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, + 8, bilinear_taps_coeff[yoffset]); + } + return variance8x8_neon(temp2, 8, dst, dst_stride, sse); +} diff --git a/media/libvpx/vp8/common/arm/variance_arm.c b/media/libvpx/vp8/common/arm/variance_arm.c new file mode 100644 index 000000000..0f293f03d --- /dev/null +++ b/media/libvpx/vp8/common/arm/variance_arm.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/variance.h" +#include "vp8/common/filter.h" + +// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder +#if CONFIG_VP8_ENCODER + +#if HAVE_MEDIA +#include "vp8/common/arm/bilinearfilter_arm.h" + +unsigned int vp8_sub_pixel_variance8x8_armv6 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short first_pass[10*8]; + unsigned char second_pass[8*8]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, + src_pixels_per_line, + 9, 8, HFilter); + vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, + 8, 8, 8, VFilter); + + return vpx_variance8x8_media(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); +} + +unsigned int vp8_sub_pixel_variance16x16_armv6 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short first_pass[36*16]; + unsigned char second_pass[20*16]; + const short *HFilter, *VFilter; + unsigned int var; + + if (xoffset == 4 && yoffset == 0) + { + var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } + else if (xoffset == 0 && yoffset == 4) + { + var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } + else if (xoffset == 4 && yoffset == 4) + { + var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, sse); + } + else + { + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, + src_pixels_per_line, + 17, 16, HFilter); + vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, + 16, 16, 16, VFilter); + + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); + } + return var; +} + +#endif // HAVE_MEDIA + + +#if HAVE_NEON + +extern unsigned int vp8_sub_pixel_variance16x16_neon_func +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +); + +unsigned int vp8_sub_pixel_variance16x16_neon +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + if (xoffset == 4 && yoffset == 0) + return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 0 && yoffset == 4) + return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 4 && yoffset == 4) + return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else + return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); +} + +#endif // HAVE_NEON +#endif // CONFIG_VP8_ENCODER diff --git a/media/libvpx/vp8/common/blockd.c b/media/libvpx/vp8/common/blockd.c new file mode 100644 index 000000000..1fc3cd0ca --- /dev/null +++ b/media/libvpx/vp8/common/blockd.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" + +const unsigned char vp8_block2left[25] = +{ + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; +const unsigned char vp8_block2above[25] = +{ + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 +}; diff --git a/media/libvpx/vp8/common/blockd.h b/media/libvpx/vp8/common/blockd.h new file mode 100644 index 000000000..192108a06 --- /dev/null +++ b/media/libvpx/vp8/common/blockd.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_BLOCKD_H_ +#define VP8_COMMON_BLOCKD_H_ + +void vpx_log(const char *format, ...); + +#include "vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "mv.h" +#include "treecoder.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*#define DCPRED 1*/ +#define DCPREDSIMTHRESH 0 +#define DCPREDCNTTHRESH 3 + +#define MB_FEATURE_TREE_PROBS 3 +#define MAX_MB_SEGMENTS 4 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 4 + +/* Segment Feature Masks */ +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +typedef struct +{ + int r, c; +} POS; + +#define PLANE_TYPE_Y_NO_DC 0 +#define PLANE_TYPE_Y2 1 +#define PLANE_TYPE_UV 2 +#define PLANE_TYPE_Y_WITH_DC 3 + + +typedef char ENTROPY_CONTEXT; +typedef struct +{ + ENTROPY_CONTEXT y1[4]; + ENTROPY_CONTEXT u[2]; + ENTROPY_CONTEXT v[2]; + ENTROPY_CONTEXT y2; +} ENTROPY_CONTEXT_PLANES; + +extern const unsigned char vp8_block2left[25]; +extern const unsigned char vp8_block2above[25]; + +#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \ + Dest = (A)+(B); + + +typedef enum +{ + KEY_FRAME = 0, + INTER_FRAME = 1 +} FRAME_TYPE; + +typedef enum +{ + DC_PRED, /* average of above and left pixels */ + V_PRED, /* vertical prediction */ + H_PRED, /* horizontal prediction */ + TM_PRED, /* Truemotion prediction */ + B_PRED, /* block based prediction, each block has its own prediction mode */ + + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + SPLITMV, + + MB_MODE_COUNT +} MB_PREDICTION_MODE; + +/* Macroblock level features */ +typedef enum +{ + MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */ + MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */ + MB_LVL_MAX = 2 /* Number of MB level features supported */ + +} MB_LVL_FEATURES; + +/* Segment Feature Masks */ +#define SEGMENT_ALTQ 0x01 +#define SEGMENT_ALT_LF 0x02 + +#define VP8_YMODES (B_PRED + 1) +#define VP8_UV_MODES (TM_PRED + 1) + +#define VP8_MVREFS (1 + SPLITMV - NEARESTMV) + +typedef enum +{ + B_DC_PRED, /* average of above and left pixels */ + B_TM_PRED, + + B_VE_PRED, /* vertical prediction */ + B_HE_PRED, /* horizontal prediction */ + + B_LD_PRED, + B_RD_PRED, + + B_VR_PRED, + B_VL_PRED, + B_HD_PRED, + B_HU_PRED, + + LEFT4X4, + ABOVE4X4, + ZERO4X4, + NEW4X4, + + B_MODE_COUNT +} B_PREDICTION_MODE; + +#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */ +#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4) + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +union b_mode_info +{ + B_PREDICTION_MODE as_mode; + int_mv mv; +}; + +typedef enum +{ + INTRA_FRAME = 0, + LAST_FRAME = 1, + GOLDEN_FRAME = 2, + ALTREF_FRAME = 3, + MAX_REF_FRAMES = 4 +} MV_REFERENCE_FRAME; + +typedef struct +{ + uint8_t mode, uv_mode; + uint8_t ref_frame; + uint8_t is_4x4; + int_mv mv; + + uint8_t partitioning; + uint8_t mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ + uint8_t need_to_clamp_mvs; + uint8_t segment_id; /* Which set of segmentation parameters should be used for this MB */ +} MB_MODE_INFO; + +typedef struct modeinfo +{ + MB_MODE_INFO mbmi; + union b_mode_info bmi[16]; +} MODE_INFO; + +#if CONFIG_MULTI_RES_ENCODING +/* The mb-level information needed to be stored for higher-resolution encoder */ +typedef struct +{ + MB_PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame; + int_mv mv; + int dissim; /* dissimilarity level of the macroblock */ +} LOWER_RES_MB_INFO; + +/* The frame-level information needed to be stored for higher-resolution + * encoder */ +typedef struct +{ + FRAME_TYPE frame_type; + int is_frame_dropped; + // The frame rate for the lowest resolution. + double low_res_framerate; + /* The frame number of each reference frames */ + unsigned int low_res_ref_frames[MAX_REF_FRAMES]; + // The video frame counter value for the key frame, for lowest resolution. + unsigned int key_frame_counter_value; + LOWER_RES_MB_INFO *mb_info; +} LOWER_RES_FRAME_INFO; +#endif + +typedef struct blockd +{ + short *qcoeff; + short *dqcoeff; + unsigned char *predictor; + short *dequant; + + int offset; + char *eob; + + union b_mode_info bmi; +} BLOCKD; + +typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +typedef struct macroblockd +{ + DECLARE_ALIGNED(16, unsigned char, predictor[384]); + DECLARE_ALIGNED(16, short, qcoeff[400]); + DECLARE_ALIGNED(16, short, dqcoeff[400]); + DECLARE_ALIGNED(16, char, eobs[25]); + + DECLARE_ALIGNED(16, short, dequant_y1[16]); + DECLARE_ALIGNED(16, short, dequant_y1_dc[16]); + DECLARE_ALIGNED(16, short, dequant_y2[16]); + DECLARE_ALIGNED(16, short, dequant_uv[16]); + + /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */ + BLOCKD block[25]; + int fullpixel_mask; + + YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */ + YV12_BUFFER_CONFIG dst; + + MODE_INFO *mode_info_context; + int mode_info_stride; + + FRAME_TYPE frame_type; + + int up_available; + int left_available; + + unsigned char *recon_above[3]; + unsigned char *recon_left[3]; + int recon_left_stride[2]; + + /* Y,U,V,Y2 */ + ENTROPY_CONTEXT_PLANES *above_context; + ENTROPY_CONTEXT_PLANES *left_context; + + /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */ + unsigned char segmentation_enabled; + + /* 0 (do not update) 1 (update) the macroblock segmentation map. */ + unsigned char update_mb_segmentation_map; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ + unsigned char update_mb_segmentation_data; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ + unsigned char mb_segement_abs_delta; + + /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */ + /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */ + vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */ + + signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */ + + /* mode_based Loop filter adjustment */ + unsigned char mode_ref_lf_delta_enabled; + unsigned char mode_ref_lf_delta_update; + + /* Delta values have the range +/- MAX_LOOP_FILTER */ + signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */ + signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */ + signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */ + signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */ + + /* Distance of MB away from frame edges */ + int mb_to_left_edge; + int mb_to_right_edge; + int mb_to_top_edge; + int mb_to_bottom_edge; + + + + vp8_subpix_fn_t subpixel_predict; + vp8_subpix_fn_t subpixel_predict8x4; + vp8_subpix_fn_t subpixel_predict8x8; + vp8_subpix_fn_t subpixel_predict16x16; + + void *current_bc; + + int corrupted; + +#if ARCH_X86 || ARCH_X86_64 + /* This is an intermediate buffer currently used in sub-pixel motion search + * to keep a copy of the reference area. This buffer can be used for other + * purpose. + */ + DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]); +#endif +} MACROBLOCKD; + + +extern void vp8_build_block_doffsets(MACROBLOCKD *x); +extern void vp8_setup_block_dptrs(MACROBLOCKD *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_BLOCKD_H_ diff --git a/media/libvpx/vp8/common/coefupdateprobs.h b/media/libvpx/vp8/common/coefupdateprobs.h new file mode 100644 index 000000000..d96a19e74 --- /dev/null +++ b/media/libvpx/vp8/common/coefupdateprobs.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_COEFUPDATEPROBS_H_ +#define VP8_COMMON_COEFUPDATEPROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Update probabilities for the nodes in the token entropy tree. + Generated file included by entropy.c */ + +const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] = +{ + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_COEFUPDATEPROBS_H_ diff --git a/media/libvpx/vp8/common/common.h b/media/libvpx/vp8/common/common.h new file mode 100644 index 000000000..ba3d9f54d --- /dev/null +++ b/media/libvpx/vp8/common/common.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_COMMON_H_ +#define VP8_COMMON_COMMON_H_ + +#include <assert.h> + +/* Interface header for common constant data structures and lookup tables */ + +#include "vpx_mem/vpx_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) + +/* Only need this for fixed-size arrays, for structs just assign. */ + +#define vp8_copy( Dest, Src) { \ + assert( sizeof( Dest) == sizeof( Src)); \ + memcpy( Dest, Src, sizeof( Src)); \ + } + +/* Use this for variably-sized arrays. */ + +#define vp8_copy_array( Dest, Src, N) { \ + assert( sizeof( *Dest) == sizeof( *Src)); \ + memcpy( Dest, Src, N * sizeof( *Src)); \ + } + +#define vp8_zero( Dest) memset( &Dest, 0, sizeof( Dest)); + +#define vp8_zero_array( Dest, N) memset( Dest, 0, N * sizeof( *Dest)); + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_COMMON_H_ diff --git a/media/libvpx/vp8/common/copy_c.c b/media/libvpx/vp8/common/copy_c.c new file mode 100644 index 000000000..e3392913f --- /dev/null +++ b/media/libvpx/vp8/common/copy_c.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <string.h> + +#include "./vp8_rtcd.h" +#include "vpx/vpx_integer.h" + +/* Copy 2 macroblocks to a buffer */ +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, + unsigned char *dst_ptr, int dst_stride, + int height) +{ + int r; + + for (r = 0; r < height; r++) + { + memcpy(dst_ptr, src_ptr, 32); + + src_ptr += src_stride; + dst_ptr += dst_stride; + + } +} diff --git a/media/libvpx/vp8/common/debugmodes.c b/media/libvpx/vp8/common/debugmodes.c new file mode 100644 index 000000000..159fddc6a --- /dev/null +++ b/media/libvpx/vp8/common/debugmodes.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <stdio.h> +#include "blockd.h" + + +void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame) +{ + + int mb_row; + int mb_col; + int mb_index = 0; + FILE *mvs = fopen("mvs.stt", "a"); + + /* print out the macroblock Y modes */ + mb_index = 0; + fprintf(mvs, "Mb Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + mb_index = 0; + fprintf(mvs, "Mb mv ref for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + /* print out the macroblock UV modes */ + mb_index = 0; + fprintf(mvs, "UV Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + /* print out the block modes */ + fprintf(mvs, "Mbs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; b_row++) + { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; b_col++) + { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + + if (mi[mb_index].mbmi.mode == B_PRED) + fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode); + else + fprintf(mvs, "xx "); + + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + /* print out the macroblock mvs */ + mb_index = 0; + fprintf(mvs, "MVs for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + + /* print out the block modes */ + fprintf(mvs, "MVs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; b_row++) + { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; b_col++) + { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col); + + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + + fclose(mvs); +} diff --git a/media/libvpx/vp8/common/default_coef_probs.h b/media/libvpx/vp8/common/default_coef_probs.h new file mode 100644 index 000000000..4d69e4be6 --- /dev/null +++ b/media/libvpx/vp8/common/default_coef_probs.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. +*/ + +#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VP8_COMMON_DEFAULT_COEF_PROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*Generated file, included by entropy.c*/ + + +static const vp8_prob default_coef_probs [BLOCK_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = +{ + { /* Block Type ( 0 ) */ + { /* Coeff Band ( 0 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 1 ) */ + { /* Coeff Band ( 0 )*/ + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 2 ) */ + { /* Coeff Band ( 0 )*/ + { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 3 ) */ + { /* Coeff Band ( 0 )*/ + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + } + } +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/media/libvpx/vp8/common/dequantize.c b/media/libvpx/vp8/common/dequantize.c new file mode 100644 index 000000000..f8b04fa4e --- /dev/null +++ b/media/libvpx/vp8/common/dequantize.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequantize_b_c(BLOCKD *d, short *DQC) +{ + int i; + short *DQ = d->dqcoeff; + short *Q = d->qcoeff; + + for (i = 0; i < 16; i++) + { + DQ[i] = Q[i] * DQC[i]; + } +} + +void vp8_dequant_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride) +{ + int i; + + for (i = 0; i < 16; i++) + { + input[i] = dq[i] * input[i]; + } + + vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); + + memset(input, 0, 32); + +} diff --git a/media/libvpx/vp8/common/entropy.c b/media/libvpx/vp8/common/entropy.c new file mode 100644 index 000000000..c00e565f0 --- /dev/null +++ b/media/libvpx/vp8/common/entropy.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "entropy.h" +#include "blockd.h" +#include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + +#include "coefupdateprobs.h" + +DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = +{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) = +{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; + +DECLARE_ALIGNED(16, const unsigned char, + vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = +{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0}; + +DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = +{ + 0, 1, 4, 8, + 5, 2, 3, 6, + 9, 12, 13, 10, + 7, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) = +{ + 1, 2, 6, 7, + 3, 5, 8, 13, + 4, 9, 12, 14, + 10, 11, 15, 16 +}; + +/* vp8_default_zig_zag_mask generated with: + + void vp8_init_scan_order_mask() + { + int i; + + for (i = 0; i < 16; i++) + { + vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i; + } + + } +*/ +DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) = +{ + 1, 2, 32, 64, + 4, 16, 128, 4096, + 8, 256, 2048, 8192, + 512, 1024, 16384, -32768 +}; + +const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6}; + +/* Array indices are identical to previously-existing CONTEXT_NODE indices */ + +const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ +{ + -DCT_EOB_TOKEN, 2, /* 0 = EOB */ + -ZERO_TOKEN, 4, /* 1 = ZERO */ + -ONE_TOKEN, 6, /* 2 = ONE */ + 8, 12, /* 3 = LOW_VAL */ + -TWO_TOKEN, 10, /* 4 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ + 14, 16, /* 6 = HIGH_LOW */ + -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ + 18, 20, /* 8 = CAT_THREEFOUR */ + -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ + -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ +}; + +/* vp8_coef_encodings generated with: + vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree); +*/ +vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] = +{ + {2, 2}, + {6, 3}, + {28, 5}, + {58, 6}, + {59, 6}, + {60, 6}, + {61, 6}, + {124, 7}, + {125, 7}, + {126, 7}, + {127, 7}, + {0, 1} +}; + +/* Trees for extra bits. Probabilities are constant and + do not depend on previously encoded bits */ + +static const vp8_prob Pcat1[] = { 159}; +static const vp8_prob Pcat2[] = { 165, 145}; +static const vp8_prob Pcat3[] = { 173, 148, 140}; +static const vp8_prob Pcat4[] = { 176, 155, 140, 135}; +static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130}; +static const vp8_prob Pcat6[] = +{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129}; + + +/* tree index tables generated with: + + void init_bit_tree(vp8_tree_index *p, int n) + { + int i = 0; + + while (++i < n) + { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; + } + + void init_bit_trees() + { + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 11); + } +*/ + +static const vp8_tree_index cat1[2] = { 0, 0 }; +static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 }; +static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 }; +static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 }; +static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 }; +static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, + 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 }; + +const vp8_extra_bit_struct vp8_extra_bits[12] = +{ + { 0, 0, 0, 0}, + { 0, 0, 0, 1}, + { 0, 0, 0, 2}, + { 0, 0, 0, 3}, + { 0, 0, 0, 4}, + { cat1, Pcat1, 1, 5}, + { cat2, Pcat2, 2, 7}, + { cat3, Pcat3, 3, 11}, + { cat4, Pcat4, 4, 19}, + { cat5, Pcat5, 5, 35}, + { cat6, Pcat6, 11, 67}, + { 0, 0, 0, 0} +}; + +#include "default_coef_probs.h" + +void vp8_default_coef_probs(VP8_COMMON *pc) +{ + memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs)); +} + diff --git a/media/libvpx/vp8/common/entropy.h b/media/libvpx/vp8/common/entropy.h new file mode 100644 index 000000000..a90bab4ba --- /dev/null +++ b/media/libvpx/vp8/common/entropy.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ENTROPY_H_ +#define VP8_COMMON_ENTROPY_H_ + +#include "treecoder.h" +#include "blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Coefficient token alphabet */ + +#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ +#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ +#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ +#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ +#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ +#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ +#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ +#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ +#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ +#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */ +#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ + +#define MAX_ENTROPY_TOKENS 12 +#define ENTROPY_NODES 11 + +extern const vp8_tree_index vp8_coef_tree[]; + +extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS]; + +typedef struct +{ + vp8_tree_p tree; + const vp8_prob *prob; + int Len; + int base_val; +} vp8_extra_bit_struct; + +extern const vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */ + +#define PROB_UPDATE_BASELINE_COST 7 + +#define MAX_PROB 255 +#define DCT_MAX_VALUE 2048 + + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ + +#define BLOCK_TYPES 4 + +/* Middle dimension is a coarsening of the coefficient's + position within the 4x4 DCT. */ + +#define COEF_BANDS 8 +extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]); + +/* Inside dimension is 3-valued measure of nearby complexity, that is, + the extent to which nearby coefficients are nonzero. For the first + coefficient (DC, unless block type is 0), we look at the (already encoded) + blocks above and to the left of the current block. The context index is + then the number (0,1,or 2) of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is roughly the size of the + most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). */ + +/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ +# define PREV_COEF_CONTEXTS 3 + +extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]); + +extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + + +struct VP8Common; +void vp8_default_coef_probs(struct VP8Common *); + +extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]); +extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; + +void vp8_coef_tree_initialize(void); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ENTROPY_H_ diff --git a/media/libvpx/vp8/common/entropymode.c b/media/libvpx/vp8/common/entropymode.c new file mode 100644 index 000000000..8981a8d3c --- /dev/null +++ b/media/libvpx/vp8/common/entropymode.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#define USE_PREBUILT_TABLES + +#include "entropymode.h" +#include "entropy.h" +#include "vpx_mem/vpx_mem.h" + +#include "vp8_entropymodedata.h" + +int vp8_mv_cont(const int_mv *l, const int_mv *a) +{ + int lez = (l->as_int == 0); + int aez = (a->as_int == 0); + int lea = (l->as_int == a->as_int); + + if (lea && lez) + return SUBMVREF_LEFT_ABOVE_ZED; + + if (lea) + return SUBMVREF_LEFT_ABOVE_SAME; + + if (aez) + return SUBMVREF_ABOVE_ZED; + + if (lez) + return SUBMVREF_LEFT_ZED; + + return SUBMVREF_NORMAL; +} + +static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25}; + +const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] = +{ + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1 , 34 }, + { 208, 1 , 1 } +}; + + + +const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] = +{ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, + 1, 1, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3, + }, + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + } +}; + +const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16}; + +const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150}; + + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ + +const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */ +{ + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ +}; + +/* Again, these trees use the same probability indices as their + explicitly-programmed predecessors. */ + +const vp8_tree_index vp8_ymode_tree[8] = +{ + -DC_PRED, 2, + 4, 6, + -V_PRED, -H_PRED, + -TM_PRED, -B_PRED +}; + +const vp8_tree_index vp8_kf_ymode_tree[8] = +{ + -B_PRED, 2, + 4, 6, + -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_uv_mode_tree[6] = +{ + -DC_PRED, 2, + -V_PRED, 4, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_mbsplit_tree[6] = +{ + -3, 2, + -2, 4, + -0, -1 +}; + +const vp8_tree_index vp8_mv_ref_tree[8] = +{ + -ZEROMV, 2, + -NEARESTMV, 4, + -NEARMV, 6, + -NEWMV, -SPLITMV +}; + +const vp8_tree_index vp8_sub_mv_ref_tree[6] = +{ + -LEFT4X4, 2, + -ABOVE4X4, 4, + -ZERO4X4, -NEW4X4 +}; + +const vp8_tree_index vp8_small_mvtree [14] = +{ + 2, 8, + 4, 6, + -0, -1, + -2, -3, + 10, 12, + -4, -5, + -6, -7 +}; + +void vp8_init_mbmode_probs(VP8_COMMON *x) +{ + memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob)); + memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob)); + memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); +} + +void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1]) +{ + memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob)); +} + diff --git a/media/libvpx/vp8/common/entropymode.h b/media/libvpx/vp8/common/entropymode.h new file mode 100644 index 000000000..81bdfc4b8 --- /dev/null +++ b/media/libvpx/vp8/common/entropymode.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ENTROPYMODE_H_ +#define VP8_COMMON_ENTROPYMODE_H_ + +#include "onyxc_int.h" +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum +{ + SUBMVREF_NORMAL, + SUBMVREF_LEFT_ZED, + SUBMVREF_ABOVE_ZED, + SUBMVREF_LEFT_ABOVE_SAME, + SUBMVREF_LEFT_ABOVE_ZED +} sumvfref_t; + +typedef int vp8_mbsplit[16]; + +#define VP8_NUMMBSPLITS 4 + +extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS]; + +extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */ + +extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1]; + +extern int vp8_mv_cont(const int_mv *l, const int_mv *a); +#define SUBMVREF_COUNT 5 +extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1]; + + +extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES]; + + +extern const vp8_tree_index vp8_bmode_tree[]; + +extern const vp8_tree_index vp8_ymode_tree[]; +extern const vp8_tree_index vp8_kf_ymode_tree[]; +extern const vp8_tree_index vp8_uv_mode_tree[]; + +extern const vp8_tree_index vp8_mbsplit_tree[]; +extern const vp8_tree_index vp8_mv_ref_tree[]; +extern const vp8_tree_index vp8_sub_mv_ref_tree[]; + +extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES]; +extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES]; +extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS]; + +/* Inter mode values do not start at zero */ + +extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS]; +extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS]; + +extern const vp8_tree_index vp8_small_mvtree[]; + +extern const struct vp8_token_struct vp8_small_mvencodings[8]; + +/* Key frame default mode probs */ +extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES] +[VP8_BINTRAMODES-1]; +extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1]; +extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1]; + +void vp8_init_mbmode_probs(VP8_COMMON *x); +void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]); +void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ENTROPYMODE_H_ diff --git a/media/libvpx/vp8/common/entropymv.c b/media/libvpx/vp8/common/entropymv.c new file mode 100644 index 000000000..e5df1f095 --- /dev/null +++ b/media/libvpx/vp8/common/entropymv.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "entropymv.h" + +const MV_CONTEXT vp8_mv_update_probs[2] = +{ + {{ + 237, + 246, + 253, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 + }}, + {{ + 231, + 243, + 245, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 + }} +}; +const MV_CONTEXT vp8_default_mv_context[2] = +{ + {{ + /* row */ + 162, /* is short */ + 128, /* sign */ + 225, 146, 172, 147, 214, 39, 156, /* short tree */ + 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */ + }}, + + + + {{ + /* same for column */ + 164, /* is short */ + 128, + 204, 170, 119, 235, 140, 230, 228, + 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */ + + }} +}; diff --git a/media/libvpx/vp8/common/entropymv.h b/media/libvpx/vp8/common/entropymv.h new file mode 100644 index 000000000..42840d58a --- /dev/null +++ b/media/libvpx/vp8/common/entropymv.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ENTROPYMV_H_ +#define VP8_COMMON_ENTROPYMV_H_ + +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum +{ + mv_max = 1023, /* max absolute value of a MV component */ + MVvals = (2 * mv_max) + 1, /* # possible values "" */ + mvfp_max = 255, /* max absolute value of a full pixel MV component */ + MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */ + + mvlong_width = 10, /* Large MVs have 9 bit magnitudes */ + mvnum_short = 8, /* magnitudes 0 through 7 */ + + /* probability offsets for coding each MV component */ + + mvpis_short = 0, /* short (<= 7) vs long (>= 8) */ + MVPsign, /* sign for non-zero */ + MVPshort, /* 8 short values = 7-position tree */ + + MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */ + MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */ +}; + +typedef struct mv_context +{ + vp8_prob prob[MVPcount]; /* often come in row, col pairs */ +} MV_CONTEXT; + +extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ENTROPYMV_H_ diff --git a/media/libvpx/vp8/common/extend.c b/media/libvpx/vp8/common/extend.c new file mode 100644 index 000000000..2d938ad78 --- /dev/null +++ b/media/libvpx/vp8/common/extend.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "extend.h" +#include "vpx_mem/vpx_mem.h" + + +static void copy_and_extend_plane +( + unsigned char *s, /* source */ + int sp, /* source pitch */ + unsigned char *d, /* destination */ + int dp, /* destination pitch */ + int h, /* height */ + int w, /* width */ + int et, /* extend top border */ + int el, /* extend left border */ + int eb, /* extend bottom border */ + int er /* extend right border */ +) +{ + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + int linesize; + + /* copy the left and right most columns out */ + src_ptr1 = s; + src_ptr2 = s + w - 1; + dest_ptr1 = d - el; + dest_ptr2 = d + w; + + for (i = 0; i < h; i++) + { + memset(dest_ptr1, src_ptr1[0], el); + memcpy(dest_ptr1 + el, src_ptr1, w); + memset(dest_ptr2, src_ptr2[0], er); + src_ptr1 += sp; + src_ptr2 += sp; + dest_ptr1 += dp; + dest_ptr2 += dp; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = d - el; + src_ptr2 = d + dp * (h - 1) - el; + dest_ptr1 = d + dp * (-et) - el; + dest_ptr2 = d + dp * (h) - el; + linesize = el + er + w; + + for (i = 0; i < et; i++) + { + memcpy(dest_ptr1, src_ptr1, linesize); + dest_ptr1 += dp; + } + + for (i = 0; i < eb; i++) + { + memcpy(dest_ptr2, src_ptr2, linesize); + dest_ptr2 += dp; + } +} + + +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) +{ + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + + copy_and_extend_plane(src->y_buffer, src->y_stride, + dst->y_buffer, dst->y_stride, + src->y_height, src->y_width, + et, el, eb, er); + + et = dst->border >> 1; + el = dst->border >> 1; + eb = (dst->border >> 1) + dst->uv_height - src->uv_height; + er = (dst->border >> 1) + dst->uv_width - src->uv_width; + + copy_and_extend_plane(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, + src->uv_height, src->uv_width, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, + src->uv_height, src->uv_width, + et, el, eb, er); +} + + +void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw) +{ + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + int src_y_offset = srcy * src->y_stride + srcx; + int dst_y_offset = srcy * dst->y_stride + srcx; + int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + + /* If the side is not touching the bounder then don't extend. */ + if (srcy) + et = 0; + if (srcx) + el = 0; + if (srcy + srch != src->y_height) + eb = 0; + if (srcx + srcw != src->y_width) + er = 0; + + copy_and_extend_plane(src->y_buffer + src_y_offset, + src->y_stride, + dst->y_buffer + dst_y_offset, + dst->y_stride, + srch, srcw, + et, el, eb, er); + + et = (et + 1) >> 1; + el = (el + 1) >> 1; + eb = (eb + 1) >> 1; + er = (er + 1) >> 1; + srch = (srch + 1) >> 1; + srcw = (srcw + 1) >> 1; + + copy_and_extend_plane(src->u_buffer + src_uv_offset, + src->uv_stride, + dst->u_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, + src->uv_stride, + dst->v_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); +} + + +/* note the extension is only for the last row, for intra prediction purpose */ +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, + unsigned char *YPtr, + unsigned char *UPtr, + unsigned char *VPtr) +{ + int i; + + YPtr += ybf->y_stride * 14; + UPtr += ybf->uv_stride * 6; + VPtr += ybf->uv_stride * 6; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } + + YPtr += ybf->y_stride; + UPtr += ybf->uv_stride; + VPtr += ybf->uv_stride; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } +} diff --git a/media/libvpx/vp8/common/extend.h b/media/libvpx/vp8/common/extend.h new file mode 100644 index 000000000..068f4ac52 --- /dev/null +++ b/media/libvpx/vp8/common/extend.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_EXTEND_H_ +#define VP8_COMMON_EXTEND_H_ + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr); +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_EXTEND_H_ diff --git a/media/libvpx/vp8/common/filter.c b/media/libvpx/vp8/common/filter.c new file mode 100644 index 000000000..84c608eff --- /dev/null +++ b/media/libvpx/vp8/common/filter.c @@ -0,0 +1,493 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "filter.h" +#include "./vp8_rtcd.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = +{ + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) = +{ + + { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */ + { 0, -6, 123, 12, -1, 0 }, + { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0 }, + { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0 }, + { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0 }, +}; + +static void filter_block2d_first_pass +( + unsigned char *src_ptr, + int *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = Temp; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +static void filter_block2d_second_pass +( + int *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + /* Apply filter */ + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = (unsigned char)Temp; + src_ptr++; + } + + /* Start next row */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} + + +static void filter_block2d +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + int output_pitch, + const short *HFilter, + const short *VFilter +) +{ + int FData[9*4]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter); + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter); +} + + +void vp8_sixtap_predict4x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter); +} +void vp8_sixtap_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[13*16]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter); + + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); + +} + +void vp8_sixtap_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[13*16]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter); + + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter); + +} + +void vp8_sixtap_predict16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[21*24]; /* Temp data buffer used in filtering */ + + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter); + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter); + +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_stride : Stride of source block. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : INT32 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the horizontal direction to produce the filtered output + * block. Used to implement first-pass of 2-D separable filter. + * + * SPECIAL NOTES : Produces INT32 output to retain precision for next pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * + ****************************************************************************/ +static void filter_block2d_bil_first_pass +( + unsigned char *src_ptr, + unsigned short *dst_ptr, + unsigned int src_stride, + unsigned int height, + unsigned int width, + const short *vp8_filter +) +{ + unsigned int i, j; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + /* Apply bilinear filter */ + dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[1] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_stride - width; + dst_ptr += width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : INT32 *src_ptr : Pointer to source block. + * UINT32 dst_pitch : Destination block pitch. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the vertical direction to produce the filtered output + * block. Used to implement second-pass of 2-D separable filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * + ****************************************************************************/ +static void filter_block2d_bil_second_pass +( + unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + /* Apply filter */ + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[width] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2); + dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + /* Next row... */ + dst_ptr += dst_pitch; + } +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pitch : Stride of source block. + * UINT32 dst_pitch : Stride of destination block. + * INT32 *HFilter : Array of 2 horizontal filter taps. + * INT32 *VFilter : Array of 2 vertical filter taps. + * INT32 Width : Block width + * INT32 Height : Block height + * + * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : 2-D filters an input block by applying a 2-tap + * bi-linear filter horizontally followed by a 2-tap + * bi-linear filter vertically on the result. + * + * SPECIAL NOTES : The largest block size can be handled here is 16x16 + * + ****************************************************************************/ +static void filter_block2d_bil +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + unsigned int src_pitch, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height +) +{ + + unsigned short FData[17*16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); + + /* then 1-D vertically... */ + filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; +#if 0 + { + int i; + unsigned char temp1[16]; + unsigned char temp2[16]; + + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + + for (i = 0; i < 16; i++) + { + if (temp1[i] != temp2[i]) + { + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + } + } + } +#endif + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); + +} + +void vp8_bilinear_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); + +} + +void vp8_bilinear_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); + +} + +void vp8_bilinear_predict16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/media/libvpx/vp8/common/filter.h b/media/libvpx/vp8/common/filter.h new file mode 100644 index 000000000..cfba775fc --- /dev/null +++ b/media/libvpx/vp8/common/filter.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_FILTER_H_ +#define VP8_COMMON_FILTER_H_ + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]); +extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_FILTER_H_ diff --git a/media/libvpx/vp8/common/findnearmv.c b/media/libvpx/vp8/common/findnearmv.c new file mode 100644 index 000000000..e8ee40f56 --- /dev/null +++ b/media/libvpx/vp8/common/findnearmv.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "findnearmv.h" + +const unsigned char vp8_mbsplit_offset[4][16] = { + { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} +}; + +/* Predict motion vectors using those from already-decoded nearby blocks. + Note that we only consider one 4x4 subblock from each candidate 16x16 + macroblock. */ +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv *nearest, + int_mv *nearby, + int_mv *best_mv, + int cnt[4], + int refframe, + int *ref_frame_sign_bias +) +{ + const MODE_INFO *above = here - xd->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + int_mv near_mvs[4]; + int_mv *mv = near_mvs; + int *cntx = cnt; + enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV}; + + /* Zero accumulators */ + mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; + cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + + /* Process above */ + if (above->mbmi.ref_frame != INTRA_FRAME) + { + if (above->mbmi.mv.as_int) + { + (++mv)->as_int = above->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias); + ++cntx; + } + + *cntx += 2; + } + + /* Process left */ + if (left->mbmi.ref_frame != INTRA_FRAME) + { + if (left->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = left->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 2; + } + else + cnt[CNT_INTRA] += 2; + } + + /* Process above left */ + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) + { + if (aboveleft->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = aboveleft->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 1; + } + else + cnt[CNT_INTRA] += 1; + } + + /* If we have three distinct MV's ... */ + if (cnt[CNT_SPLITMV]) + { + /* See if above-left MV can be merged with NEAREST */ + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + cnt[CNT_NEAREST] += 1; + } + + cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + + (left->mbmi.mode == SPLITMV)) * 2 + + (aboveleft->mbmi.mode == SPLITMV); + + /* Swap near and nearest if necessary */ + if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) + { + int tmp; + tmp = cnt[CNT_NEAREST]; + cnt[CNT_NEAREST] = cnt[CNT_NEAR]; + cnt[CNT_NEAR] = tmp; + tmp = near_mvs[CNT_NEAREST].as_int; + near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; + near_mvs[CNT_NEAR].as_int = tmp; + } + + /* Use near_mvs[0] to store the "best" MV */ + if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) + near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; + + /* Set up return values */ + best_mv->as_int = near_mvs[0].as_int; + nearest->as_int = near_mvs[CNT_NEAREST].as_int; + nearby->as_int = near_mvs[CNT_NEAR].as_int; +} + + +static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd) +{ + inv->as_mv.row = src->as_mv.row * -1; + inv->as_mv.col = src->as_mv.col * -1; + vp8_clamp_mv2(inv, xd); + vp8_clamp_mv2(src, xd); +} + + +int vp8_find_near_mvs_bias +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], + int cnt[4], + int refframe, + int *ref_frame_sign_bias +) +{ + int sign_bias = ref_frame_sign_bias[refframe]; + + vp8_find_near_mvs(xd, + here, + &mode_mv_sb[sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARMV], + &best_mv_sb[sign_bias], + cnt, + refframe, + ref_frame_sign_bias); + + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARESTMV], xd); + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV], + &mode_mv_sb[sign_bias][NEARMV], xd); + invert_and_clamp_mvs(&best_mv_sb[!sign_bias], + &best_mv_sb[sign_bias], xd); + + return sign_bias; +} + + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +) +{ + p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0]; + p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1]; + p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2]; + p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3]; + /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/ + return p; +} + diff --git a/media/libvpx/vp8/common/findnearmv.h b/media/libvpx/vp8/common/findnearmv.h new file mode 100644 index 000000000..3c8c0506f --- /dev/null +++ b/media/libvpx/vp8/common/findnearmv.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_FINDNEARMV_H_ +#define VP8_COMMON_FINDNEARMV_H_ + +#include "mv.h" +#include "blockd.h" +#include "modecont.h" +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, + const int *ref_frame_sign_bias) +{ + if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) + { + mvp->as_mv.row *= -1; + mvp->as_mv.col *= -1; + } +} + +#define LEFT_TOP_MARGIN (16 << 3) +#define RIGHT_BOTTOM_MARGIN (16 << 3) +static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) +{ + if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN)) + mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN) + mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + + if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN)) + mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN) + mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; +} + +static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge, + int mb_to_top_edge, int mb_to_bottom_edge) +{ + mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ? + mb_to_left_edge : mv->as_mv.col; + mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ? + mb_to_right_edge : mv->as_mv.col; + mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ? + mb_to_top_edge : mv->as_mv.row; + mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ? + mb_to_bottom_edge : mv->as_mv.row; +} +static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, + int mb_to_right_edge, int mb_to_top_edge, + int mb_to_bottom_edge) +{ + unsigned int need_to_clamp; + need_to_clamp = (mv->as_mv.col < mb_to_left_edge); + need_to_clamp |= (mv->as_mv.col > mb_to_right_edge); + need_to_clamp |= (mv->as_mv.row < mb_to_top_edge); + need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge); + return need_to_clamp; +} + +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv *nearest, int_mv *nearby, int_mv *best, + int near_mv_ref_cts[4], + int refframe, + int *ref_frame_sign_bias +); + + +int vp8_find_near_mvs_bias +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], + int cnt[4], + int refframe, + int *ref_frame_sign_bias +); + + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +); + +extern const unsigned char vp8_mbsplit_offset[4][16]; + + +static int left_block_mv(const MODE_INFO *cur_mb, int b) +{ + if (!(b & 3)) + { + /* On L edge, get from MB to left of us */ + --cur_mb; + + if(cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv.as_int; + b += 4; + } + + return (cur_mb->bmi + b - 1)->mv.as_int; +} + +static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) +{ + if (!(b >> 2)) + { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + if(cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv.as_int; + b += 16; + } + + return (cur_mb->bmi + (b - 4))->mv.as_int; +} +static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) +{ + if (!(b & 3)) + { + /* On L edge, get from MB to left of us */ + --cur_mb; + switch (cur_mb->mbmi.mode) + { + case B_PRED: + return (cur_mb->bmi + b + 3)->as_mode; + case DC_PRED: + return B_DC_PRED; + case V_PRED: + return B_VE_PRED; + case H_PRED: + return B_HE_PRED; + case TM_PRED: + return B_TM_PRED; + default: + return B_DC_PRED; + } + } + + return (cur_mb->bmi + b - 1)->as_mode; +} + +static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride) +{ + if (!(b >> 2)) + { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + switch (cur_mb->mbmi.mode) + { + case B_PRED: + return (cur_mb->bmi + b + 12)->as_mode; + case DC_PRED: + return B_DC_PRED; + case V_PRED: + return B_VE_PRED; + case H_PRED: + return B_HE_PRED; + case TM_PRED: + return B_TM_PRED; + default: + return B_DC_PRED; + } + } + + return (cur_mb->bmi + b - 4)->as_mode; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_FINDNEARMV_H_ diff --git a/media/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/vp8/common/generic/systemdependent.c new file mode 100644 index 000000000..8ee7e0230 --- /dev/null +++ b/media/libvpx/vp8/common/generic/systemdependent.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#if ARCH_ARM +#include "vpx_ports/arm.h" +#elif ARCH_X86 || ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +#include "vp8/common/onyxc_int.h" +#include "vp8/common/systemdependent.h" + +#if CONFIG_MULTITHREAD +#if HAVE_UNISTD_H && !defined(__OS2__) +#include <unistd.h> +#elif defined(_WIN32) +#include <windows.h> +#include <intrin.h> +typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO); +#elif defined(__OS2__) +#define INCL_DOS +#define INCL_DOSSPINLOCK +#include <os2.h> +#endif +#endif + +#if CONFIG_MULTITHREAD +static int get_cpu_count() +{ + int core_count = 16; + +#if HAVE_UNISTD_H && !defined(__OS2__) +#if defined(_SC_NPROCESSORS_ONLN) + core_count = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_SC_NPROC_ONLN) + core_count = sysconf(_SC_NPROC_ONLN); +#endif +#elif defined(_WIN32) + { + PGNSI pGNSI; + SYSTEM_INFO sysinfo; + + /* Call GetNativeSystemInfo if supported or + * GetSystemInfo otherwise. */ + + pGNSI = (PGNSI) GetProcAddress( + GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo"); + if (pGNSI != NULL) + pGNSI(&sysinfo); + else + GetSystemInfo(&sysinfo); + + core_count = sysinfo.dwNumberOfProcessors; + } +#elif defined(__OS2__) + { + ULONG proc_id; + ULONG status; + + core_count = 0; + for (proc_id = 1; ; proc_id++) + { + if (DosGetProcessorStatus(proc_id, &status)) + break; + + if (status == PROC_ONLINE) + core_count++; + } + } +#else + /* other platforms */ +#endif + + return core_count > 0 ? core_count : 1; +} +#endif + +void vp8_clear_system_state_c() {}; + +void vp8_machine_specific_config(VP8_COMMON *ctx) +{ +#if CONFIG_MULTITHREAD + ctx->processor_core_count = get_cpu_count(); +#endif /* CONFIG_MULTITHREAD */ + +#if ARCH_ARM + ctx->cpu_caps = arm_cpu_caps(); +#elif ARCH_X86 || ARCH_X86_64 + ctx->cpu_caps = x86_simd_caps(); +#endif +} diff --git a/media/libvpx/vp8/common/header.h b/media/libvpx/vp8/common/header.h new file mode 100644 index 000000000..e27bca16b --- /dev/null +++ b/media/libvpx/vp8/common/header.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_HEADER_H_ +#define VP8_COMMON_HEADER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* 24 bits total */ +typedef struct +{ + unsigned int type: 1; + unsigned int version: 3; + unsigned int show_frame: 1; + + /* Allow 2^20 bytes = 8 megabits for first partition */ + + unsigned int first_partition_length_in_bytes: 19; + +#ifdef PACKET_TESTING + unsigned int frame_number; + unsigned int update_gold: 1; + unsigned int uses_gold: 1; + unsigned int update_last: 1; + unsigned int uses_last: 1; +#endif + +} VP8_HEADER; + +#ifdef PACKET_TESTING +#define VP8_HEADER_SIZE 8 +#else +#define VP8_HEADER_SIZE 3 +#endif + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_HEADER_H_ diff --git a/media/libvpx/vp8/common/idct_blk.c b/media/libvpx/vp8/common/idct_blk.c new file mode 100644 index 000000000..8aa7d9bf0 --- /dev/null +++ b/media/libvpx/vp8/common/idct_blk.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred, + int pred_stride, unsigned char *dst_ptr, + int dst_stride); + +void vp8_dequant_idct_add_y_block_c + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i, j; + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, dst, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4*stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_c + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i, j; + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, dstu, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstu += 4; + } + + dstu += 4*stride - 8; + } + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, dstv, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstv += 4; + } + + dstv += 4*stride - 8; + } +} diff --git a/media/libvpx/vp8/common/idctllm.c b/media/libvpx/vp8/common/idctllm.c new file mode 100644 index 000000000..f5403c5aa --- /dev/null +++ b/media/libvpx/vp8/common/idctllm.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" + +/**************************************************************************** + * Notes: + * + * This implementation makes use of 16 bit fixed point verio of two multiply + * constants: + * 1. sqrt(2) * cos (pi/8) + * 2. sqrt(2) * sin (pi/8) + * Becuase the first constant is bigger than 1, to maintain the same 16 bit + * fixed point precision as the second one, we use a trick of + * x * a = x + x*(a-1) + * so + * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). + **************************************************************************/ +static const int cospi8sqrt2minus1 = 20091; +static const int sinpi8sqrt2 = 35468; + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) +{ + int i; + int r, c; + int a1, b1, c1, d1; + short output[16]; + short *ip = input; + short *op = output; + int temp1, temp2; + int shortpitch = 4; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[8]; + b1 = ip[0] - ip[8]; + + temp1 = (ip[4] * sinpi8sqrt2) >> 16; + temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[12] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + op[shortpitch*0] = a1 + d1; + op[shortpitch*3] = a1 - d1; + + op[shortpitch*1] = b1 + c1; + op[shortpitch*2] = b1 - c1; + + ip++; + op++; + } + + ip = output; + op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[2]; + b1 = ip[0] - ip[2]; + + temp1 = (ip[1] * sinpi8sqrt2) >> 16; + temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[3] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + + op[0] = (a1 + d1 + 4) >> 3; + op[3] = (a1 - d1 + 4) >> 3; + + op[1] = (b1 + c1 + 4) >> 3; + op[2] = (b1 - c1 + 4) >> 3; + + ip += shortpitch; + op += shortpitch; + } + + ip = output; + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int a = ip[c] + pred_ptr[c] ; + + if (a < 0) + a = 0; + + if (a > 255) + a = 255; + + dst_ptr[c] = (unsigned char) a ; + } + ip += 4; + dst_ptr += dst_stride; + pred_ptr += pred_stride; + } +} + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) +{ + int a1 = ((input_dc + 4) >> 3); + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int a = a1 + pred_ptr[c] ; + + if (a < 0) + a = 0; + + if (a > 255) + a = 255; + + dst_ptr[c] = (unsigned char) a ; + } + + dst_ptr += dst_stride; + pred_ptr += pred_stride; + } + +} + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff) +{ + short output[16]; + int i; + int a1, b1, c1, d1; + int a2, b2, c2, d2; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + op[0] = a1 + b1; + op[4] = c1 + d1; + op[8] = a1 - b1; + op[12] = d1 - c1; + ip++; + op++; + } + + ip = output; + op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[3]; + b1 = ip[1] + ip[2]; + c1 = ip[1] - ip[2]; + d1 = ip[0] - ip[3]; + + a2 = a1 + b1; + b2 = c1 + d1; + c2 = a1 - b1; + d2 = d1 - c1; + + op[0] = (a2 + 3) >> 3; + op[1] = (b2 + 3) >> 3; + op[2] = (c2 + 3) >> 3; + op[3] = (d2 + 3) >> 3; + + ip += 4; + op += 4; + } + + for(i = 0; i < 16; i++) + { + mb_dqcoeff[i * 16] = output[i]; + } +} + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff) +{ + int i; + int a1; + + a1 = ((input[0] + 3) >> 3); + for(i = 0; i < 16; i++) + { + mb_dqcoeff[i * 16] = a1; + } +} diff --git a/media/libvpx/vp8/common/invtrans.h b/media/libvpx/vp8/common/invtrans.h new file mode 100644 index 000000000..affe57e3d --- /dev/null +++ b/media/libvpx/vp8/common/invtrans.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_INVTRANS_H_ +#define VP8_COMMON_INVTRANS_H_ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "blockd.h" +#include "onyxc_int.h" + +#if CONFIG_MULTITHREAD +#include "vpx_mem/vpx_mem.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static void eob_adjust(char *eobs, short *diff) +{ + /* eob adjust.... the idct can only skip if both the dc and eob are zero */ + int js; + for(js = 0; js < 16; js++) + { + if((eobs[js] == 0) && (diff[0] != 0)) + eobs[js]++; + diff+=16; + } +} + +static void vp8_inverse_transform_mby(MACROBLOCKD *xd) +{ + short *DQC = xd->dequant_y1; + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + vp8_short_inv_walsh4x4 + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + else + { + vp8_short_inv_walsh4x4_1 + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + eob_adjust(xd->eobs, xd->qcoeff); + + DQC = xd->dequant_y1_dc; + } + vp8_dequant_idct_add_y_block + (xd->qcoeff, DQC, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_INVTRANS_H_ diff --git a/media/libvpx/vp8/common/loopfilter.c b/media/libvpx/vp8/common/loopfilter.c new file mode 100644 index 000000000..756ad488f --- /dev/null +++ b/media/libvpx/vp8/common/loopfilter.c @@ -0,0 +1,661 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "loopfilter.h" +#include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + + +static void lf_init_lut(loop_filter_info_n *lfi) +{ + int filt_lvl; + + for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) + { + if (filt_lvl >= 40) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3; + } + else if (filt_lvl >= 20) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2; + } + else if (filt_lvl >= 15) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1; + } + else + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0; + } + } + + lfi->mode_lf_lut[DC_PRED] = 1; + lfi->mode_lf_lut[V_PRED] = 1; + lfi->mode_lf_lut[H_PRED] = 1; + lfi->mode_lf_lut[TM_PRED] = 1; + lfi->mode_lf_lut[B_PRED] = 0; + + lfi->mode_lf_lut[ZEROMV] = 1; + lfi->mode_lf_lut[NEARESTMV] = 2; + lfi->mode_lf_lut[NEARMV] = 2; + lfi->mode_lf_lut[NEWMV] = 2; + lfi->mode_lf_lut[SPLITMV] = 3; + +} + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl) +{ + int i; + + /* For each possible value for the loop filter fill out limits */ + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + int block_inside_limit = 0; + + /* Set loop filter paramaeters that control sharpness. */ + block_inside_limit = filt_lvl >> (sharpness_lvl > 0); + block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + + if (sharpness_lvl > 0) + { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) + block_inside_limit = 1; + + memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); + memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH); + memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +void vp8_loop_filter_init(VP8_COMMON *cm) +{ + loop_filter_info_n *lfi = &cm->lf_info; + int i; + + /* init limits for given sharpness*/ + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + + /* init LUT for lvl and hev thr picking */ + lf_init_lut(lfi); + + /* init hev threshold const vectors */ + for(i = 0; i < 4 ; i++) + { + memset(lfi->hev_thr[i], i, SIMD_WIDTH); + } +} + +void vp8_loop_filter_frame_init(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl) +{ + int seg, /* segment number */ + ref, /* index in ref_lf_deltas */ + mode; /* index in mode_lf_deltas */ + + loop_filter_info_n *lfi = &cm->lf_info; + + /* update limits if sharpness has changed */ + if(cm->last_sharpness_level != cm->sharpness_level) + { + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + } + + for(seg = 0; seg < MAX_MB_SEGMENTS; seg++) + { + int lvl_seg = default_filt_lvl; + int lvl_ref, lvl_mode; + + /* Note the baseline filter values for each segment */ + if (mbd->segmentation_enabled) + { + /* Abs value */ + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + { + lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + else /* Delta Value */ + { + lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0; + } + + if (!mbd->mode_ref_lf_delta_enabled) + { + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 ); + continue; + } + + /* INTRA_FRAME */ + ref = INTRA_FRAME; + + /* Apply delta for reference frame */ + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; + + /* Apply delta for Intra modes */ + mode = 0; /* B_PRED */ + /* Only the split mode BPRED has a further special case */ + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; + + lfi->lvl[seg][ref][mode] = lvl_mode; + + mode = 1; /* all the rest of Intra modes */ + /* clamp */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; + lfi->lvl[seg][ref][mode] = lvl_mode; + + /* LAST, GOLDEN, ALT */ + for(ref = 1; ref < MAX_REF_FRAMES; ref++) + { + /* Apply delta for reference frame */ + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; + + /* Apply delta for Inter modes */ + for (mode = 1; mode < 4; mode++) + { + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; + + lfi->lvl[seg][ref][mode] = lvl_mode; + } + } + } +} + + +void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr) +{ + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + FRAME_TYPE frame_type = cm->frame_type; + + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + +} + +void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr) +{ + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &cm->lf_info; + (void)post_uvstride; + + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post_ystride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post_ystride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post_ystride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post_ystride, lfi_n->blim[filter_level]); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + +} +void vp8_loop_filter_frame(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int frame_type) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int mb_row; + int mb_col; + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; + + int filter_level; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + int post_y_stride = post->y_stride; + int post_uv_stride = post->uv_stride; + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(cm, mbd, cm->filter_level); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + u_ptr = post->u_buffer; + v_ptr = post->v_buffer; + + /* vp8_filter each macro block */ + if (cm->filter_type == NORMAL_LOOPFILTER) + { + for (mb_row = 0; mb_row < mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + + } + } + else /* SIMPLE_LOOPFILTER */ + { + for (mb_row = 0; mb_row < mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + if (filter_level) + { + const unsigned char * mblim = lfi_n->mblim[filter_level]; + const unsigned char * blim = lfi_n->blim[filter_level]; + + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post_y_stride, mblim); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post_y_stride, blim); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post_y_stride, mblim); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post_y_stride, blim); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + + } + } +} + +void vp8_loop_filter_frame_yonly +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context ++; /* step to next MB */ + + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context ++; /* Skip border mb */ + } + +} + +void vp8_loop_filter_partial_frame +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + int mb_cols = post->y_width >> 4; + int mb_rows = post->y_height >> 4; + + int linestocopy; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + const MODE_INFO *mode_info_context; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl); + + /* number of MB rows to use in partial filtering */ + linestocopy = mb_rows / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ + + /* Set up the buffer pointers; partial image starts at ~middle of frame */ + y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride; + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = + lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + vp8_loop_filter_mbh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + vp8_loop_filter_simple_mbh + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context += 1; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context += 1; /* Skip border mb */ + } +} diff --git a/media/libvpx/vp8/common/loopfilter.h b/media/libvpx/vp8/common/loopfilter.h new file mode 100644 index 000000000..20a6bd375 --- /dev/null +++ b/media/libvpx/vp8/common/loopfilter.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_LOOPFILTER_H_ +#define VP8_COMMON_LOOPFILTER_H_ + +#include "vpx_ports/mem.h" +#include "vpx_config.h" +#include "vp8_rtcd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +/* fraction of total macroblock rows to be used in fast filter level picking */ +/* has to be > 2 */ +#define PARTIAL_FRAME_FRACTION 8 + +typedef enum +{ + NORMAL_LOOPFILTER = 0, + SIMPLE_LOOPFILTER = 1 +} LOOPFILTERTYPE; + +#if ARCH_ARM +#define SIMD_WIDTH 1 +#else +#define SIMD_WIDTH 16 +#endif + +/* Need to align this structure so when it is declared and + * passed it can be loaded into vector registers. + */ +typedef struct +{ + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]); + unsigned char lvl[4][4][4]; + unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1]; + unsigned char mode_lf_lut[10]; +} loop_filter_info_n; + +typedef struct loop_filter_info +{ + const unsigned char * mblim; + const unsigned char * blim; + const unsigned char * lim; + const unsigned char * hev_thr; +} loop_filter_info; + + +typedef void loop_filter_uvfunction +( + unsigned char *u, /* source pointer */ + int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + unsigned char *v +); + +/* assorted loopfilter functions which get used elsewhere */ +struct VP8Common; +struct macroblockd; +struct modeinfo; + +void vp8_loop_filter_init(struct VP8Common *cm); + +void vp8_loop_filter_frame_init(struct VP8Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd, + int frame_type); + +void vp8_loop_filter_partial_frame(struct VP8Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_frame_yonly(struct VP8Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl); + +void vp8_loop_filter_row_normal(struct VP8Common *cm, + struct modeinfo *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr); + +void vp8_loop_filter_row_simple(struct VP8Common *cm, + struct modeinfo *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_LOOPFILTER_H_ diff --git a/media/libvpx/vp8/common/loopfilter_filters.c b/media/libvpx/vp8/common/loopfilter_filters.c new file mode 100644 index 000000000..1d51696ff --- /dev/null +++ b/media/libvpx/vp8/common/loopfilter_filters.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <stdlib.h> +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef unsigned char uc; + +static signed char vp8_signed_char_clamp(int t) +{ + t = (t < -128 ? -128 : t); + t = (t > 127 ? 127 : t); + return (signed char) t; +} + + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static signed char vp8_filter_mask(uc limit, uc blimit, + uc p3, uc p2, uc p1, uc p0, + uc q0, uc q1, uc q2, uc q3) +{ + signed char mask = 0; + mask |= (abs(p3 - p2) > limit); + mask |= (abs(p2 - p1) > limit); + mask |= (abs(p1 - p0) > limit); + mask |= (abs(q1 - q0) > limit); + mask |= (abs(q2 - q1) > limit); + mask |= (abs(q3 - q2) > limit); + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit); + return mask - 1; +} + +/* is there high variance internal edge ( 11111111 yes, 00000000 no) */ +static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) +{ + signed char hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static void vp8_filter(signed char mask, uc hev, uc *op1, + uc *op0, uc *oq0, uc *oq1) + +{ + signed char ps0, qs0; + signed char ps1, qs1; + signed char filter_value, Filter1, Filter2; + signed char u; + + ps1 = (signed char) * op1 ^ 0x80; + ps0 = (signed char) * op0 ^ 0x80; + qs0 = (signed char) * oq0 ^ 0x80; + qs1 = (signed char) * oq1 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value &= hev; + + /* inner taps */ + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 + * if it equals 4 we'll set to adjust by -1 to account for the fact + * we'd round 3 the other way + */ + Filter1 = vp8_signed_char_clamp(filter_value + 4); + Filter2 = vp8_signed_char_clamp(filter_value + 3); + Filter1 >>= 3; + Filter2 >>= 3; + u = vp8_signed_char_clamp(qs0 - Filter1); + *oq0 = u ^ 0x80; + u = vp8_signed_char_clamp(ps0 + Filter2); + *op0 = u ^ 0x80; + filter_value = Filter1; + + /* outer tap adjustments */ + filter_value += 1; + filter_value >>= 1; + filter_value &= ~hev; + + u = vp8_signed_char_clamp(qs1 - filter_value); + *oq1 = u ^ 0x80; + u = vp8_signed_char_clamp(ps1 + filter_value); + *op1 = u ^ 0x80; + +} +void vp8_loop_filter_horizontal_edge_c +( + unsigned char *s, + int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do + { + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + + ++s; + } + while (++i < count * 8); +} + +void vp8_loop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do + { + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); + + s += p; + } + while (++i < count * 8); +} + +static void vp8_mbfilter(signed char mask, uc hev, + uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) +{ + signed char s, u; + signed char filter_value, Filter1, Filter2; + signed char ps2 = (signed char) * op2 ^ 0x80; + signed char ps1 = (signed char) * op1 ^ 0x80; + signed char ps0 = (signed char) * op0 ^ 0x80; + signed char qs0 = (signed char) * oq0 ^ 0x80; + signed char qs1 = (signed char) * oq1 ^ 0x80; + signed char qs2 = (signed char) * oq2 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; + + Filter2 = filter_value; + Filter2 &= hev; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(Filter2 + 4); + Filter2 = vp8_signed_char_clamp(Filter2 + 3); + Filter1 >>= 3; + Filter2 >>= 3; + qs0 = vp8_signed_char_clamp(qs0 - Filter1); + ps0 = vp8_signed_char_clamp(ps0 + Filter2); + + + /* only apply wider filter if not high edge variance */ + filter_value &= ~hev; + Filter2 = filter_value; + + /* roughly 3/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7); + s = vp8_signed_char_clamp(qs0 - u); + *oq0 = s ^ 0x80; + s = vp8_signed_char_clamp(ps0 + u); + *op0 = s ^ 0x80; + + /* roughly 2/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7); + s = vp8_signed_char_clamp(qs1 - u); + *oq1 = s ^ 0x80; + s = vp8_signed_char_clamp(ps1 + u); + *op1 = s ^ 0x80; + + /* roughly 1/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); + s = vp8_signed_char_clamp(qs2 - u); + *oq2 = s ^ 0x80; + s = vp8_signed_char_clamp(ps2 + u); + *op2 = s ^ 0x80; +} + +void vp8_mbloop_filter_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do + { + + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); + + ++s; + } + while (++i < count * 8); + +} + + +void vp8_mbloop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + do + { + + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); + + s += p; + } + while (++i < count * 8); + +} + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1) +{ +/* Why does this cause problems for win32? + * error C2143: syntax error : missing ';' before 'type' + * (void) limit; + */ + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; + return mask; +} + +static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1) +{ + signed char filter_value, Filter1, Filter2; + signed char p1 = (signed char) * op1 ^ 0x80; + signed char p0 = (signed char) * op0 ^ 0x80; + signed char q0 = (signed char) * oq0 ^ 0x80; + signed char q1 = (signed char) * oq1 ^ 0x80; + signed char u; + + filter_value = vp8_signed_char_clamp(p1 - q1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0)); + filter_value &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(filter_value + 4); + Filter1 >>= 3; + u = vp8_signed_char_clamp(q0 - Filter1); + *oq0 = u ^ 0x80; + + Filter2 = vp8_signed_char_clamp(filter_value + 3); + Filter2 >>= 3; + u = vp8_signed_char_clamp(p0 + Filter2); + *op0 = u ^ 0x80; +} + +void vp8_loop_filter_simple_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) +{ + signed char mask = 0; + int i = 0; + + do + { + mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } + while (++i < 16); +} + +void vp8_loop_filter_simple_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) +{ + signed char mask = 0; + int i = 0; + + do + { + mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); + vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); + s += p; + } + while (++i < 16); + +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); +} diff --git a/media/libvpx/vp8/common/mbpitch.c b/media/libvpx/vp8/common/mbpitch.c new file mode 100644 index 000000000..32e1b6640 --- /dev/null +++ b/media/libvpx/vp8/common/mbpitch.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "blockd.h" + +void vp8_setup_block_dptrs(MACROBLOCKD *x) +{ + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 25; r++) + { + x->block[r].qcoeff = x->qcoeff + r * 16; + x->block[r].dqcoeff = x->dqcoeff + r * 16; + x->block[r].eob = x->eobs + r; + } +} + +void vp8_build_block_doffsets(MACROBLOCKD *x) +{ + int block; + + for (block = 0; block < 16; block++) /* y blocks */ + { + x->block[block].offset = + (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4; + } + + for (block = 16; block < 20; block++) /* U and V blocks */ + { + x->block[block+4].offset = + x->block[block].offset = + ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4; + } +} diff --git a/media/libvpx/vp8/common/mfqe.c b/media/libvpx/vp8/common/mfqe.c new file mode 100644 index 000000000..5c0680f42 --- /dev/null +++ b/media/libvpx/vp8/common/mfqe.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* MFQE: Multiframe Quality Enhancement + * In rate limited situations keyframes may cause significant visual artifacts + * commonly referred to as "popping." This file implements a postproccesing + * algorithm which blends data from the preceeding frame when there is no + * motion and the q from the previous frame is lower which indicates that it is + * higher quality. + */ + +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/postproc.h" +#include "vp8/common/variance.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +#include <limits.h> +#include <stdlib.h> + +static void filter_by_weight(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int block_size, int src_weight) +{ + int dst_weight = (1 << MFQE_PRECISION) - src_weight; + int rounding_bit = 1 << (MFQE_PRECISION - 1); + int r, c; + + for (r = 0; r < block_size; r++) + { + for (c = 0; c < block_size; c++) + { + dst[c] = (src[c] * src_weight + + dst[c] * dst_weight + + rounding_bit) >> MFQE_PRECISION; + } + src += src_stride; + dst += dst_stride; + } +} + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int src_weight) +{ + filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight); +} + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int src_weight) +{ + filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight); +} + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int src_weight) +{ + filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight); +} + +static void apply_ifactor(unsigned char *y_src, + int y_src_stride, + unsigned char *y_dst, + int y_dst_stride, + unsigned char *u_src, + unsigned char *v_src, + int uv_src_stride, + unsigned char *u_dst, + unsigned char *v_dst, + int uv_dst_stride, + int block_size, + int src_weight) +{ + if (block_size == 16) + { + vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight); + vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight); + vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight); + } + else /* if (block_size == 8) */ + { + vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight); + vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight); + vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight); + } +} + +static unsigned int int_sqrt(unsigned int x) +{ + unsigned int y = x; + unsigned int guess; + int p = 1; + while (y>>=1) p++; + p>>=1; + + guess=0; + while (p>=0) + { + guess |= (1<<p); + if (x<guess*guess) + guess -= (1<<p); + p--; + } + /* choose between guess or guess+1 */ + return guess+(guess*guess+guess+1<=x); +} + +#define USE_SSD +static void multiframe_quality_enhance_block +( + int blksize, /* Currently only values supported are 16, 8 */ + int qcurr, + int qprev, + unsigned char *y, + unsigned char *u, + unsigned char *v, + int y_stride, + int uv_stride, + unsigned char *yd, + unsigned char *ud, + unsigned char *vd, + int yd_stride, + int uvd_stride +) +{ + static const unsigned char VP8_ZEROS[16]= + { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + }; + int uvblksize = blksize >> 1; + int qdiff = qcurr - qprev; + + int i; + unsigned char *up; + unsigned char *udp; + unsigned char *vp; + unsigned char *vdp; + + unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk; + + if (blksize == 16) + { + actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; + act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; +#ifdef USE_SSD + vpx_variance16x16(y, y_stride, yd, yd_stride, &sse); + sad = (sse + 128)>>8; + vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse); + usad = (sse + 32)>>6; + vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse); + vsad = (sse + 32)>>6; +#else + sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; + usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; + vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6; +#endif + } + else /* if (blksize == 8) */ + { + actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; + act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; +#ifdef USE_SSD + vpx_variance8x8(y, y_stride, yd, yd_stride, &sse); + sad = (sse + 32)>>6; + vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse); + usad = (sse + 8)>>4; + vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse); + vsad = (sse + 8)>>4; +#else + sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6; + usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4; + vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4; +#endif + } + + actrisk = (actd > act * 5); + + /* thr = qdiff/16 + log2(act) + log4(qprev) */ + thr = (qdiff >> 4); + while (actd >>= 1) thr++; + while (qprev >>= 2) thr++; + +#ifdef USE_SSD + thrsq = thr * thr; + if (sad < thrsq && + /* additional checks for color mismatch and excessive addition of + * high-frequencies */ + 4 * usad < thrsq && 4 * vsad < thrsq && !actrisk) +#else + if (sad < thr && + /* additional checks for color mismatch and excessive addition of + * high-frequencies */ + 2 * usad < thr && 2 * vsad < thr && !actrisk) +#endif + { + int ifactor; +#ifdef USE_SSD + /* TODO: optimize this later to not need sqr root */ + sad = int_sqrt(sad); +#endif + ifactor = (sad << MFQE_PRECISION) / thr; + ifactor >>= (qdiff >> 5); + + if (ifactor) + { + apply_ifactor(y, y_stride, yd, yd_stride, + u, v, uv_stride, + ud, vd, uvd_stride, + blksize, ifactor); + } + } + else /* else implicitly copy from previous frame */ + { + if (blksize == 16) + { + vp8_copy_mem16x16(y, y_stride, yd, yd_stride); + vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); + vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); + } + else /* if (blksize == 8) */ + { + vp8_copy_mem8x8(y, y_stride, yd, yd_stride); + for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride) + memcpy(udp, up, uvblksize); + for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride) + memcpy(vdp, vp, uvblksize); + } + } +} + +static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) +{ + if (mode_info_context->mbmi.mb_skip_coeff) + map[0] = map[1] = map[2] = map[3] = 1; + else if (mode_info_context->mbmi.mode==SPLITMV) + { + static int ndx[4][4] = + { + {0, 1, 4, 5}, + {2, 3, 6, 7}, + {8, 9, 12, 13}, + {10, 11, 14, 15} + }; + int i, j; + for (i=0; i<4; ++i) + { + map[i] = 1; + for (j=0; j<4 && map[j]; ++j) + map[i] &= (mode_info_context->bmi[ndx[i][j]].mv.as_mv.row <= 2 && + mode_info_context->bmi[ndx[i][j]].mv.as_mv.col <= 2); + } + } + else + { + map[0] = map[1] = map[2] = map[3] = + (mode_info_context->mbmi.mode > B_PRED && + abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 && + abs(mode_info_context->mbmi.mv.as_mv.col) <= 2); + } + return (map[0]+map[1]+map[2]+map[3]); +} + +void vp8_multiframe_quality_enhance +( + VP8_COMMON *cm +) +{ + YV12_BUFFER_CONFIG *show = cm->frame_to_show; + YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer; + + FRAME_TYPE frame_type = cm->frame_type; + /* Point at base of Mb MODE_INFO list has motion vectors etc */ + const MODE_INFO *mode_info_context = cm->show_frame_mi; + int mb_row; + int mb_col; + int totmap, map[4]; + int qcurr = cm->base_qindex; + int qprev = cm->postproc_state.last_base_qindex; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + unsigned char *yd_ptr, *ud_ptr, *vd_ptr; + + /* Set up the buffer pointers */ + y_ptr = show->y_buffer; + u_ptr = show->u_buffer; + v_ptr = show->v_buffer; + yd_ptr = dest->y_buffer; + ud_ptr = dest->u_buffer; + vd_ptr = dest->v_buffer; + + /* postprocess each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + /* if motion is high there will likely be no benefit */ + if (frame_type == INTER_FRAME) totmap = qualify_inter_mb(mode_info_context, map); + else totmap = (frame_type == KEY_FRAME ? 4 : 0); + if (totmap) + { + if (totmap < 4) + { + int i, j; + for (i=0; i<2; ++i) + for (j=0; j<2; ++j) + { + if (map[i*2+j]) + { + multiframe_quality_enhance_block(8, qcurr, qprev, + y_ptr + 8*(i*show->y_stride+j), + u_ptr + 4*(i*show->uv_stride+j), + v_ptr + 4*(i*show->uv_stride+j), + show->y_stride, + show->uv_stride, + yd_ptr + 8*(i*dest->y_stride+j), + ud_ptr + 4*(i*dest->uv_stride+j), + vd_ptr + 4*(i*dest->uv_stride+j), + dest->y_stride, + dest->uv_stride); + } + else + { + /* copy a 8x8 block */ + int k; + unsigned char *up = u_ptr + 4*(i*show->uv_stride+j); + unsigned char *udp = ud_ptr + 4*(i*dest->uv_stride+j); + unsigned char *vp = v_ptr + 4*(i*show->uv_stride+j); + unsigned char *vdp = vd_ptr + 4*(i*dest->uv_stride+j); + vp8_copy_mem8x8(y_ptr + 8*(i*show->y_stride+j), show->y_stride, + yd_ptr + 8*(i*dest->y_stride+j), dest->y_stride); + for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride, + vp += show->uv_stride, vdp += dest->uv_stride) + { + memcpy(udp, up, 4); + memcpy(vdp, vp, 4); + } + } + } + } + else /* totmap = 4 */ + { + multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr, + u_ptr, v_ptr, + show->y_stride, + show->uv_stride, + yd_ptr, ud_ptr, vd_ptr, + dest->y_stride, + dest->uv_stride); + } + } + else + { + vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride); + vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride); + vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride); + } + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + yd_ptr += 16; + ud_ptr += 8; + vd_ptr += 8; + mode_info_context++; /* step to next MB */ + } + + y_ptr += show->y_stride * 16 - 16 * cm->mb_cols; + u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols; + v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols; + yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols; + ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols; + vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols; + + mode_info_context++; /* Skip border mb */ + } +} diff --git a/media/libvpx/vp8/common/modecont.c b/media/libvpx/vp8/common/modecont.c new file mode 100644 index 000000000..86a74bc0f --- /dev/null +++ b/media/libvpx/vp8/common/modecont.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "entropy.h" + +const int vp8_mode_contexts[6][4] = +{ + { + /* 0 */ + 7, 1, 1, 143, + }, + { + /* 1 */ + 14, 18, 14, 107, + }, + { + /* 2 */ + 135, 64, 57, 68, + }, + { + /* 3 */ + 60, 56, 128, 65, + }, + { + /* 4 */ + 159, 134, 128, 34, + }, + { + /* 5 */ + 234, 188, 128, 28, + }, +}; diff --git a/media/libvpx/vp8/common/modecont.h b/media/libvpx/vp8/common/modecont.h new file mode 100644 index 000000000..ff34c33c5 --- /dev/null +++ b/media/libvpx/vp8/common/modecont.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_MODECONT_H_ +#define VP8_COMMON_MODECONT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern const int vp8_mode_contexts[6][4]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_MODECONT_H_ diff --git a/media/libvpx/vp8/common/mv.h b/media/libvpx/vp8/common/mv.h new file mode 100644 index 000000000..111ccd63c --- /dev/null +++ b/media/libvpx/vp8/common/mv.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_MV_H_ +#define VP8_COMMON_MV_H_ +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct +{ + short row; + short col; +} MV; + +typedef union int_mv +{ + uint32_t as_int; + MV as_mv; +} int_mv; /* facilitates faster equality tests and copies */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_MV_H_ diff --git a/media/libvpx/vp8/common/onyx.h b/media/libvpx/vp8/common/onyx.h new file mode 100644 index 000000000..f39b675cd --- /dev/null +++ b/media/libvpx/vp8/common/onyx.h @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ONYX_H_ +#define VP8_COMMON_ONYX_H_ + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#include "vpx_scale/yv12config.h" +#include "ppflags.h" + + struct VP8_COMP; + + /* Create/destroy static data structures. */ + + typedef enum + { + NORMAL = 0, + FOURFIVE = 1, + THREEFIVE = 2, + ONETWO = 3 + + } VPX_SCALING; + + typedef enum + { + USAGE_LOCAL_FILE_PLAYBACK = 0x0, + USAGE_STREAM_FROM_SERVER = 0x1, + USAGE_CONSTRAINED_QUALITY = 0x2, + USAGE_CONSTANT_QUALITY = 0x3 + } END_USAGE; + + + typedef enum + { + MODE_REALTIME = 0x0, + MODE_GOODQUALITY = 0x1, + MODE_BESTQUALITY = 0x2, + MODE_FIRSTPASS = 0x3, + MODE_SECONDPASS = 0x4, + MODE_SECONDPASS_BEST = 0x5 + } MODE; + + typedef enum + { + FRAMEFLAGS_KEY = 1, + FRAMEFLAGS_GOLDEN = 2, + FRAMEFLAGS_ALTREF = 4 + } FRAMETYPE_FLAGS; + + +#include <assert.h> + static void Scale2Ratio(int mode, int *hr, int *hs) + { + switch (mode) + { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } + } + + typedef struct + { + /* 4 versions of bitstream defined: + * 0 best quality/slowest decode, 3 lowest quality/fastest decode + */ + int Version; + int Width; + int Height; + struct vpx_rational timebase; + unsigned int target_bandwidth; /* kilobits per second */ + + /* Parameter used for applying denoiser. + * For temporal denoiser: noise_sensitivity = 0 means off, + * noise_sensitivity = 1 means temporal denoiser on for Y channel only, + * noise_sensitivity = 2 means temporal denoiser on for all channels. + * noise_sensitivity = 3 means aggressive denoising mode. + * noise_sensitivity >= 4 means adaptive denoising mode. + * Temporal denoiser is enabled via the configuration option: + * CONFIG_TEMPORAL_DENOISING. + * For spatial denoiser: noise_sensitivity controls the amount of + * pre-processing blur: noise_sensitivity = 0 means off. + * Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING. + */ + int noise_sensitivity; + + /* parameter used for sharpening output: recommendation 0: */ + int Sharpness; + int cpu_used; + unsigned int rc_max_intra_bitrate_pct; + unsigned int screen_content_mode; + + /* mode -> + *(0)=Realtime/Live Encoding. This mode is optimized for realtim + * encoding (for example, capturing a television signal or feed + * from a live camera). ( speed setting controls how fast ) + *(1)=Good Quality Fast Encoding. The encoder balances quality with + * the amount of time it takes to encode the output. ( speed + * setting controls how fast ) + *(2)=One Pass - Best Quality. The encoder places priority on the + * quality of the output over encoding speed. The output is + * compressed at the highest possible quality. This option takes + * the longest amount of time to encode. ( speed setting ignored + * ) + *(3)=Two Pass - First Pass. The encoder generates a file of + * statistics for use in the second encoding pass. ( speed + * setting controls how fast ) + *(4)=Two Pass - Second Pass. The encoder uses the statistics that + * were generated in the first encoding pass to create the + * compressed output. ( speed setting controls how fast ) + *(5)=Two Pass - Second Pass Best. The encoder uses the statistics + * that were generated in the first encoding pass to create the + * compressed output using the highest possible quality, and + * taking a longer amount of time to encode.. ( speed setting + * ignored ) + */ + int Mode; + + /* Key Framing Operations */ + int auto_key; /* automatically detect cut scenes */ + int key_freq; /* maximum distance to key frame. */ + + /* lagged compression (if allow_lag == 0 lag_in_frames is ignored) */ + int allow_lag; + int lag_in_frames; /* how many frames lag before we start encoding */ + + /* + * DATARATE CONTROL OPTIONS + */ + + int end_usage; /* vbr or cbr */ + + /* buffer targeting aggressiveness */ + int under_shoot_pct; + int over_shoot_pct; + + /* buffering parameters */ + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + + int64_t starting_buffer_level_in_ms; + int64_t optimal_buffer_level_in_ms; + int64_t maximum_buffer_size_in_ms; + + /* controlling quality */ + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + + /* allow internal resizing */ + int allow_spatial_resampling; + int resample_down_water_mark; + int resample_up_water_mark; + + /* allow internal frame rate alterations */ + int allow_df; + int drop_frames_water_mark; + + /* two pass datarate control */ + int two_pass_vbrbias; + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + + /* + * END DATARATE CONTROL OPTIONS + */ + + /* these parameters aren't to be used in final build don't use!!! */ + int play_alternate; + int alt_freq; + int alt_q; + int key_q; + int gold_q; + + + int multi_threaded; /* how many threads to run the encoder on */ + int token_partitions; /* how many token partitions to create */ + + /* early breakout threshold: for video conf recommend 800 */ + int encode_breakout; + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + int arnr_max_frames; + int arnr_strength; + int arnr_type; + + vpx_fixed_buf_t two_pass_stats_in; + struct vpx_codec_pkt_list *output_pkt_list; + + vp8e_tuning tuning; + + /* Temporal scaling parameters */ + unsigned int number_of_layers; + unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY]; + unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY]; + unsigned int periodicity; + unsigned int layer_id[VPX_TS_MAX_PERIODICITY]; + +#if CONFIG_MULTI_RES_ENCODING + /* Number of total resolutions encoded */ + unsigned int mr_total_resolutions; + + /* Current encoder ID */ + unsigned int mr_encoder_id; + + /* Down-sampling factor */ + vpx_rational_t mr_down_sampling_factor; + + /* Memory location to store low-resolution encoder's mode info */ + void* mr_low_res_mode_info; +#endif + } VP8_CONFIG; + + + void vp8_initialize(); + + struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf); + void vp8_remove_compressor(struct VP8_COMP* *comp); + + void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf); + void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf); + + int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); + int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); + int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); + + int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags); + int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags); + int vp8_get_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int vp8_set_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); + int vp8_update_entropy(struct VP8_COMP* comp, int update); + int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); + int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols); + int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + int vp8_get_quantizer(struct VP8_COMP* c); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_COMMON_ONYX_H_ diff --git a/media/libvpx/vp8/common/onyxc_int.h b/media/libvpx/vp8/common/onyxc_int.h new file mode 100644 index 000000000..6d89865c6 --- /dev/null +++ b/media/libvpx/vp8/common/onyxc_int.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ONYXC_INT_H_ +#define VP8_COMMON_ONYXC_INT_H_ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "loopfilter.h" +#include "entropymv.h" +#include "entropy.h" +#if CONFIG_POSTPROC +#include "postproc.h" +#endif + +/*#ifdef PACKET_TESTING*/ +#include "header.h" +/*#endif*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MINQ 0 +#define MAXQ 127 +#define QINDEX_RANGE (MAXQ + 1) + +#define NUM_YV12_BUFFERS 4 + +#define MAX_PARTITIONS 9 + +typedef struct frame_contexts +{ + vp8_prob bmode_prob [VP8_BINTRAMODES-1]; + vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */ + vp8_prob uv_mode_prob [VP8_UV_MODES-1]; + vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1]; + vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + MV_CONTEXT mvc[2]; +} FRAME_CONTEXT; + +typedef enum +{ + ONE_PARTITION = 0, + TWO_PARTITION = 1, + FOUR_PARTITION = 2, + EIGHT_PARTITION = 3 +} TOKEN_PARTITION; + +typedef enum +{ + RECON_CLAMP_REQUIRED = 0, + RECON_CLAMP_NOTREQUIRED = 1 +} CLAMP_TYPE; + +typedef struct VP8Common + +{ + struct vpx_internal_error_info error; + + DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]); + + int Width; + int Height; + int horiz_scale; + int vert_scale; + + CLAMP_TYPE clamp_type; + + YV12_BUFFER_CONFIG *frame_to_show; + + YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; + int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; + int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx; + + YV12_BUFFER_CONFIG temp_scale_frame; + +#if CONFIG_POSTPROC + YV12_BUFFER_CONFIG post_proc_buffer; + YV12_BUFFER_CONFIG post_proc_buffer_int; + int post_proc_buffer_int_used; + unsigned char *pp_limits_buffer; /* post-processing filter coefficients */ +#endif + + FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ + FRAME_TYPE frame_type; + + int show_frame; + + int frame_flags; + int MBs; + int mb_rows; + int mb_cols; + int mode_info_stride; + + /* profile settings */ + int mb_no_coeff_skip; + int no_lpf; + int use_bilinear_mc_filter; + int full_pixel; + + int base_qindex; + + int y1dc_delta_q; + int y2dc_delta_q; + int y2ac_delta_q; + int uvdc_delta_q; + int uvac_delta_q; + + /* We allocate a MODE_INFO struct for each macroblock, together with + an extra row on top and column on the left to simplify prediction. */ + + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ +#if CONFIG_ERROR_CONCEALMENT + MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ + MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ +#endif + MODE_INFO *show_frame_mi; /* MODE_INFO for the last decoded frame + to show */ + LOOPFILTERTYPE filter_type; + + loop_filter_info_n lf_info; + + int filter_level; + int last_sharpness_level; + int sharpness_level; + + int refresh_last_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */ + + int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */ + int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */ + + int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */ + + int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + + /* Y,U,V,Y2 */ + ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */ + ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */ + + FRAME_CONTEXT lfc; /* last frame entropy */ + FRAME_CONTEXT fc; /* this frame entropy */ + + unsigned int current_video_frame; + + int version; + + TOKEN_PARTITION multi_token_partition; + +#ifdef PACKET_TESTING + VP8_HEADER oh; +#endif +#if CONFIG_POSTPROC_VISUALIZER + double bitrate; + double framerate; +#endif + +#if CONFIG_MULTITHREAD + int processor_core_count; +#endif +#if CONFIG_POSTPROC + struct postproc_state postproc_state; +#endif + int cpu_caps; +} VP8_COMMON; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ONYXC_INT_H_ diff --git a/media/libvpx/vp8/common/onyxd.h b/media/libvpx/vp8/common/onyxd.h new file mode 100644 index 000000000..e37b29f32 --- /dev/null +++ b/media/libvpx/vp8/common/onyxd.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ONYXD_H_ +#define VP8_COMMON_ONYXD_H_ + + +/* Create/destroy static data structures. */ +#ifdef __cplusplus +extern "C" +{ +#endif +#include "vpx_scale/yv12config.h" +#include "ppflags.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_codec.h" +#include "vpx/vp8.h" + + struct VP8D_COMP; + + typedef struct + { + int Width; + int Height; + int Version; + int postprocess; + int max_threads; + int error_concealment; + } VP8D_CONFIG; + + typedef enum + { + VP8D_OK = 0 + } VP8D_SETTING; + + void vp8dx_initialize(void); + + void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x); + + int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst); + + int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, + size_t size, const uint8_t *dest, + int64_t time_stamp); + int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); + + vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); + vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); + +#ifdef __cplusplus +} +#endif + + +#endif // VP8_COMMON_ONYXD_H_ diff --git a/media/libvpx/vp8/common/postproc.c b/media/libvpx/vp8/common/postproc.c new file mode 100644 index 000000000..a4e6ae170 --- /dev/null +++ b/media/libvpx/vp8/common/postproc.c @@ -0,0 +1,1206 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_scale_rtcd.h" +#include "vpx_scale/yv12config.h" +#include "postproc.h" +#include "common.h" +#include "vpx_scale/vpx_scale.h" +#include "systemdependent.h" + +#include <limits.h> +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#define RGB_TO_YUV(t) \ + ( (0.257*(float)(t>>16)) + (0.504*(float)(t>>8&0xff)) + (0.098*(float)(t&0xff)) + 16), \ + (-(0.148*(float)(t>>16)) - (0.291*(float)(t>>8&0xff)) + (0.439*(float)(t&0xff)) + 128), \ + ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128) + +/* global constants */ +#if CONFIG_POSTPROC_VISUALIZER +static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = +{ + { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ + { RGB_TO_YUV(0x00FF00) }, /* Green */ + { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */ + { RGB_TO_YUV(0x228B22) }, /* ForestGreen */ + { RGB_TO_YUV(0x006400) }, /* DarkGreen */ + { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */ + { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */ + { RGB_TO_YUV(0x00008B) }, /* Dark blue */ + { RGB_TO_YUV(0x551A8B) }, /* Purple */ + { RGB_TO_YUV(0xFF0000) } /* Red */ +}; + +static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = +{ + { RGB_TO_YUV(0x6633ff) }, /* Purple */ + { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ + { RGB_TO_YUV(0xff33cc) }, /* Pink */ + { RGB_TO_YUV(0xff3366) }, /* Coral */ + { RGB_TO_YUV(0x3366ff) }, /* Blue */ + { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */ + { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */ + { RGB_TO_YUV(0xff6633) }, /* Orange */ + { RGB_TO_YUV(0x33ccff) }, /* Light Blue */ + { RGB_TO_YUV(0x8ab800) }, /* Green */ + { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ + { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ + { RGB_TO_YUV(0x66ff33) }, /* Light Green */ + { RGB_TO_YUV(0xccff33) }, /* Yellow */ +}; + +static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = +{ + { RGB_TO_YUV(0x00ff00) }, /* Blue */ + { RGB_TO_YUV(0x0000ff) }, /* Green */ + { RGB_TO_YUV(0xffff00) }, /* Yellow */ + { RGB_TO_YUV(0xff0000) }, /* Red */ +}; +#endif + +const short vp8_rv[] = +{ + 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, + 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, + 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, + 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, + 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, + 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, + 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, + 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, + 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, + 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, + 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, + 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, + 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, + 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, + 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, +}; + +extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch); +extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch); +/*********************************************************************************************************** + */ +void vp8_post_proc_down_and_across_mb_row_c +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int cols, + unsigned char *f, + int size +) +{ + unsigned char *p_src, *p_dst; + int row; + int col; + unsigned char v; + unsigned char d[4]; + + for (row = 0; row < size; row++) + { + /* post_proc_down for one row */ + p_src = src_ptr; + p_dst = dst_ptr; + + for (col = 0; col < cols; col++) + { + unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line]; + unsigned char p_above1 = p_src[col - src_pixels_per_line]; + unsigned char p_below1 = p_src[col + src_pixels_per_line]; + unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line]; + + v = p_src[col]; + + if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) + && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) + { + unsigned char k1, k2, k3; + k1 = (p_above2 + p_above1 + 1) >> 1; + k2 = (p_below2 + p_below1 + 1) >> 1; + k3 = (k1 + k2 + 1) >> 1; + v = (k3 + v + 1) >> 1; + } + + p_dst[col] = v; + } + + /* now post_proc_across */ + p_src = dst_ptr; + p_dst = dst_ptr; + + p_src[-2] = p_src[-1] = p_src[0]; + p_src[cols] = p_src[cols + 1] = p_src[cols - 1]; + + for (col = 0; col < cols; col++) + { + v = p_src[col]; + + if ((abs(v - p_src[col - 2]) < f[col]) + && (abs(v - p_src[col - 1]) < f[col]) + && (abs(v - p_src[col + 1]) < f[col]) + && (abs(v - p_src[col + 2]) < f[col])) + { + unsigned char k1, k2, k3; + k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; + k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; + k3 = (k1 + k2 + 1) >> 1; + v = (k3 + v + 1) >> 1; + } + + d[col & 3] = v; + + if (col >= 2) + p_dst[col - 2] = d[(col - 2) & 3]; + } + + /* handle the last two pixels */ + p_dst[col - 2] = d[(col - 2) & 3]; + p_dst[col - 1] = d[(col - 1) & 3]; + + /* next row */ + src_ptr += src_pixels_per_line; + dst_ptr += dst_pixels_per_line; + } +} + +static int q2mbl(int x) +{ + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} + +void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit) +{ + int r, c, i; + + unsigned char *s = src; + unsigned char d[16]; + + for (r = 0; r < rows; r++) + { + int sumsq = 0; + int sum = 0; + + for (i = -8; i < 0; i++) + s[i]=s[0]; + + /* 17 avoids valgrind warning - we buffer values in c in d + * and only write them when we've read 8 ahead... + */ + for (i = 0; i < 17; i++) + s[i+cols]=s[cols-1]; + + for (i = -8; i <= 6; i++) + { + sumsq += s[i] * s[i]; + sum += s[i]; + d[i+8] = 0; + } + + for (c = 0; c < cols + 8; c++) + { + int x = s[c+7] - s[c-8]; + int y = s[c+7] + s[c-8]; + + sum += x; + sumsq += x * y; + + d[c&15] = s[c]; + + if (sumsq * 15 - sum * sum < flimit) + { + d[c&15] = (8 + sum + s[c]) >> 4; + } + + s[c-8] = d[(c-8)&15]; + } + + s += pitch; + } +} + +void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) +{ + int r, c, i; + const short *rv3 = &vp8_rv[63&rand()]; + + for (c = 0; c < cols; c++ ) + { + unsigned char *s = &dst[c]; + int sumsq = 0; + int sum = 0; + unsigned char d[16]; + const short *rv2 = rv3 + ((c * 17) & 127); + + for (i = -8; i < 0; i++) + s[i*pitch]=s[0]; + + /* 17 avoids valgrind warning - we buffer values in c in d + * and only write them when we've read 8 ahead... + */ + for (i = 0; i < 17; i++) + s[(i+rows)*pitch]=s[(rows-1)*pitch]; + + for (i = -8; i <= 6; i++) + { + sumsq += s[i*pitch] * s[i*pitch]; + sum += s[i*pitch]; + } + + for (r = 0; r < rows + 8; r++) + { + sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch]; + sum += s[7*pitch] - s[-8*pitch]; + d[r&15] = s[0]; + + if (sumsq * 15 - sum * sum < flimit) + { + d[r&15] = (rv2[r&127] + sum + s[0]) >> 4; + } + if (r >= 8) + s[-8*pitch] = d[(r-8)&15]; + s += pitch; + } + } +} + +#if CONFIG_POSTPROC +static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, + int q) +{ + vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); + vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); +} + +void vp8_deblock(VP8_COMMON *cm, + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag) +{ + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + + const MODE_INFO *mode_info_context = cm->show_frame_mi; + int mbr, mbc; + + /* The pixel thresholds are adjusted according to if or not the macroblock + * is a skipped block. */ + unsigned char *ylimits = cm->pp_limits_buffer; + unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols; + (void) low_var_thresh; + (void) flag; + + if (ppl > 0) + { + for (mbr = 0; mbr < cm->mb_rows; mbr++) + { + unsigned char *ylptr = ylimits; + unsigned char *uvlptr = uvlimits; + for (mbc = 0; mbc < cm->mb_cols; mbc++) + { + unsigned char mb_ppl; + + if (mode_info_context->mbmi.mb_skip_coeff) + mb_ppl = (unsigned char)ppl >> 1; + else + mb_ppl = (unsigned char)ppl; + + memset(ylptr, mb_ppl, 16); + memset(uvlptr, mb_ppl, 8); + + ylptr += 16; + uvlptr += 8; + mode_info_context++; + } + mode_info_context++; + + vp8_post_proc_down_and_across_mb_row( + source->y_buffer + 16 * mbr * source->y_stride, + post->y_buffer + 16 * mbr * post->y_stride, source->y_stride, + post->y_stride, source->y_width, ylimits, 16); + + vp8_post_proc_down_and_across_mb_row( + source->u_buffer + 8 * mbr * source->uv_stride, + post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride, + post->uv_stride, source->uv_width, uvlimits, 8); + vp8_post_proc_down_and_across_mb_row( + source->v_buffer + 8 * mbr * source->uv_stride, + post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride, + post->uv_stride, source->uv_width, uvlimits, 8); + } + } else + { + vp8_yv12_copy_frame(source, post); + } +} +#endif + +void vp8_de_noise(VP8_COMMON *cm, + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + int uvfilter) +{ + int mbr; + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; + unsigned char *limits = cm->pp_limits_buffer;; + (void) post; + (void) low_var_thresh; + (void) flag; + + memset(limits, (unsigned char)ppl, 16 * mb_cols); + + /* TODO: The original code don't filter the 2 outer rows and columns. */ + for (mbr = 0; mbr < mb_rows; mbr++) + { + vp8_post_proc_down_and_across_mb_row( + source->y_buffer + 16 * mbr * source->y_stride, + source->y_buffer + 16 * mbr * source->y_stride, + source->y_stride, source->y_stride, source->y_width, limits, 16); + if (uvfilter == 1) { + vp8_post_proc_down_and_across_mb_row( + source->u_buffer + 8 * mbr * source->uv_stride, + source->u_buffer + 8 * mbr * source->uv_stride, + source->uv_stride, source->uv_stride, source->uv_width, limits, + 8); + vp8_post_proc_down_and_across_mb_row( + source->v_buffer + 8 * mbr * source->uv_stride, + source->v_buffer + 8 * mbr * source->uv_stride, + source->uv_stride, source->uv_stride, source->uv_width, limits, + 8); + } + } +} + +static double gaussian(double sigma, double mu, double x) +{ + return 1 / (sigma * sqrt(2.0 * 3.14159265)) * + (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); +} + +static void fillrd(struct postproc_state *state, int q, int a) +{ + char char_dist[300]; + + double sigma; + int i; + + vp8_clear_system_state(); + + + sigma = a + .5 + .6 * (63 - q) / 63.0; + + /* set up a lookup table of 256 entries that matches + * a gaussian distribution with sigma determined by q. + */ + { + int next, j; + + next = 0; + + for (i = -32; i < 32; i++) + { + const int v = (int)(.5 + 256 * gaussian(sigma, 0, i)); + + if (v) + { + for (j = 0; j < v; j++) + { + char_dist[next+j] = (char) i; + } + + next = next + j; + } + + } + + for (; next < 256; next++) + char_dist[next] = 0; + + } + + for (i = 0; i < 3072; i++) + { + state->noise[i] = char_dist[rand() & 0xff]; + } + + for (i = 0; i < 16; i++) + { + state->blackclamp[i] = -char_dist[0]; + state->whiteclamp[i] = -char_dist[0]; + state->bothclamp[i] = -2 * char_dist[0]; + } + + state->last_q = q; + state->last_noise = a; +} + +/**************************************************************************** + * + * ROUTINE : plane_add_noise_c + * + * INPUTS : unsigned char *Start starting address of buffer to add gaussian + * noise to + * unsigned int Width width of plane + * unsigned int Height height of plane + * int Pitch distance between subsequent lines of frame + * int q quantizer used to determine amount of noise + * to add + * + * OUTPUTS : None. + * + * RETURNS : void. + * + * FUNCTION : adds gaussian noise to a plane of pixels + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void vp8_plane_add_noise_c(unsigned char *Start, char *noise, + char blackclamp[16], + char whiteclamp[16], + char bothclamp[16], + unsigned int Width, unsigned int Height, int Pitch) +{ + unsigned int i, j; + (void)bothclamp; + + for (i = 0; i < Height; i++) + { + unsigned char *Pos = Start + i * Pitch; + char *Ref = (char *)(noise + (rand() & 0xff)); + + for (j = 0; j < Width; j++) + { + if (Pos[j] < blackclamp[0]) + Pos[j] = blackclamp[0]; + + if (Pos[j] > 255 + whiteclamp[0]) + Pos[j] = 255 + whiteclamp[0]; + + Pos[j] += Ref[j]; + } + } +} + +/* Blend the macro block with a solid colored square. Leave the + * edges unblended to give distinction to macro blocks in areas + * filled with the same color block. + */ +void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v, + int y_1, int u_1, int v_1, int alpha, int stride) +{ + int i, j; + int y1_const = y_1*((1<<16)-alpha); + int u1_const = u_1*((1<<16)-alpha); + int v1_const = v_1*((1<<16)-alpha); + + y += 2*stride + 2; + for (i = 0; i < 12; i++) + { + for (j = 0; j < 12; j++) + { + y[j] = (y[j]*alpha + y1_const)>>16; + } + y += stride; + } + + stride >>= 1; + + u += stride + 1; + v += stride + 1; + + for (i = 0; i < 6; i++) + { + for (j = 0; j < 6; j++) + { + u[j] = (u[j]*alpha + u1_const)>>16; + v[j] = (v[j]*alpha + v1_const)>>16; + } + u += stride; + v += stride; + } +} + +/* Blend only the edge of the macro block. Leave center + * unblended to allow for other visualizations to be layered. + */ +void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v, + int y_1, int u_1, int v_1, int alpha, int stride) +{ + int i, j; + int y1_const = y_1*((1<<16)-alpha); + int u1_const = u_1*((1<<16)-alpha); + int v1_const = v_1*((1<<16)-alpha); + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 16; j++) + { + y[j] = (y[j]*alpha + y1_const)>>16; + } + y += stride; + } + + for (i = 0; i < 12; i++) + { + y[0] = (y[0]*alpha + y1_const)>>16; + y[1] = (y[1]*alpha + y1_const)>>16; + y[14] = (y[14]*alpha + y1_const)>>16; + y[15] = (y[15]*alpha + y1_const)>>16; + y += stride; + } + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 16; j++) + { + y[j] = (y[j]*alpha + y1_const)>>16; + } + y += stride; + } + + stride >>= 1; + + for (j = 0; j < 8; j++) + { + u[j] = (u[j]*alpha + u1_const)>>16; + v[j] = (v[j]*alpha + v1_const)>>16; + } + u += stride; + v += stride; + + for (i = 0; i < 6; i++) + { + u[0] = (u[0]*alpha + u1_const)>>16; + v[0] = (v[0]*alpha + v1_const)>>16; + + u[7] = (u[7]*alpha + u1_const)>>16; + v[7] = (v[7]*alpha + v1_const)>>16; + + u += stride; + v += stride; + } + + for (j = 0; j < 8; j++) + { + u[j] = (u[j]*alpha + u1_const)>>16; + v[j] = (v[j]*alpha + v1_const)>>16; + } +} + +void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v, + int y_1, int u_1, int v_1, int alpha, int stride) +{ + int i, j; + int y1_const = y_1*((1<<16)-alpha); + int u1_const = u_1*((1<<16)-alpha); + int v1_const = v_1*((1<<16)-alpha); + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + y[j] = (y[j]*alpha + y1_const)>>16; + } + y += stride; + } + + stride >>= 1; + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + u[j] = (u[j]*alpha + u1_const)>>16; + v[j] = (v[j]*alpha + v1_const)>>16; + } + u += stride; + v += stride; + } +} + +static void constrain_line (int x_0, int *x_1, int y_0, int *y_1, int width, int height) +{ + int dx; + int dy; + + if (*x_1 > width) + { + dx = *x_1 - x_0; + dy = *y_1 - y_0; + + *x_1 = width; + if (dx) + *y_1 = ((width-x_0)*dy)/dx + y_0; + } + if (*x_1 < 0) + { + dx = *x_1 - x_0; + dy = *y_1 - y_0; + + *x_1 = 0; + if (dx) + *y_1 = ((0-x_0)*dy)/dx + y_0; + } + if (*y_1 > height) + { + dx = *x_1 - x_0; + dy = *y_1 - y_0; + + *y_1 = height; + if (dy) + *x_1 = ((height-y_0)*dx)/dy + x_0; + } + if (*y_1 < 0) + { + dx = *x_1 - x_0; + dy = *y_1 - y_0; + + *y_1 = 0; + if (dy) + *x_1 = ((0-y_0)*dx)/dy + x_0; + } +} + +#if CONFIG_POSTPROC +int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags) +{ + int q = oci->filter_level * 10 / 6; + int flags = ppflags->post_proc_flag; + int deblock_level = ppflags->deblocking_level; + int noise_level = ppflags->noise_level; + + if (!oci->frame_to_show) + return -1; + + if (q > 63) + q = 63; + + if (!flags) + { + *dest = *oci->frame_to_show; + + /* handle problem with extending borders */ + dest->y_width = oci->Width; + dest->y_height = oci->Height; + dest->uv_height = dest->y_height / 2; + oci->postproc_state.last_base_qindex = oci->base_qindex; + oci->postproc_state.last_frame_valid = 1; + return 0; + } + + /* Allocate post_proc_buffer_int if needed */ + if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used) + { + if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) + { + int width = (oci->Width + 15) & ~15; + int height = (oci->Height + 15) & ~15; + + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int, + width, height, VP8BORDERINPIXELS)) + vpx_internal_error(&oci->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate MFQE framebuffer"); + + oci->post_proc_buffer_int_used = 1; + + /* insure that postproc is set to all 0's so that post proc + * doesn't pull random data in from edge + */ + memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size); + + } + } + + vp8_clear_system_state(); + + if ((flags & VP8D_MFQE) && + oci->postproc_state.last_frame_valid && + oci->current_video_frame >= 2 && + oci->postproc_state.last_base_qindex < 60 && + oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) + { + vp8_multiframe_quality_enhance(oci); + if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) && + oci->post_proc_buffer_int_used) + { + vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int); + if (flags & VP8D_DEMACROBLOCK) + { + vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, + q + (deblock_level - 5) * 10, 1, 0); + vp8_de_mblock(&oci->post_proc_buffer, + q + (deblock_level - 5) * 10); + } + else if (flags & VP8D_DEBLOCK) + { + vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, + q, 1, 0); + } + } + /* Move partially towards the base q of the previous frame */ + oci->postproc_state.last_base_qindex = (3*oci->postproc_state.last_base_qindex + oci->base_qindex)>>2; + } + else if (flags & VP8D_DEMACROBLOCK) + { + vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, + q + (deblock_level - 5) * 10, 1, 0); + vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10); + + oci->postproc_state.last_base_qindex = oci->base_qindex; + } + else if (flags & VP8D_DEBLOCK) + { + vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, + q, 1, 0); + oci->postproc_state.last_base_qindex = oci->base_qindex; + } + else + { + vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer); + oci->postproc_state.last_base_qindex = oci->base_qindex; + } + oci->postproc_state.last_frame_valid = 1; + + if (flags & VP8D_ADDNOISE) + { + if (oci->postproc_state.last_q != q + || oci->postproc_state.last_noise != noise_level) + { + fillrd(&oci->postproc_state, 63 - q, noise_level); + } + + vp8_plane_add_noise + (oci->post_proc_buffer.y_buffer, + oci->postproc_state.noise, + oci->postproc_state.blackclamp, + oci->postproc_state.whiteclamp, + oci->postproc_state.bothclamp, + oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height, + oci->post_proc_buffer.y_stride); + } + +#if CONFIG_POSTPROC_VISUALIZER + if (flags & VP8D_DEBUG_TXT_FRAME_INFO) + { + char message[512]; + sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", + (oci->frame_type == KEY_FRAME), + oci->refresh_golden_frame, + oci->base_qindex, + oci->filter_level, + flags, + oci->mb_cols, oci->mb_rows); + vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); + } + + if (flags & VP8D_DEBUG_TXT_MBLK_MODES) + { + int i, j; + unsigned char *y_ptr; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + int mb_index = 0; + MODE_INFO *mi = oci->mi; + + y_ptr = post->y_buffer + 4 * post->y_stride + 4; + + /* vp8_filter each macro block */ + for (i = 0; i < mb_rows; i++) + { + for (j = 0; j < mb_cols; j++) + { + char zz[4]; + + sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a'); + + vp8_blit_text(zz, y_ptr, post->y_stride); + mb_index ++; + y_ptr += 16; + } + + mb_index ++; /* border */ + y_ptr += post->y_stride * 16 - post->y_width; + + } + } + + if (flags & VP8D_DEBUG_TXT_DC_DIFF) + { + int i, j; + unsigned char *y_ptr; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + int mb_index = 0; + MODE_INFO *mi = oci->mi; + + y_ptr = post->y_buffer + 4 * post->y_stride + 4; + + /* vp8_filter each macro block */ + for (i = 0; i < mb_rows; i++) + { + for (j = 0; j < mb_cols; j++) + { + char zz[4]; + int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED && + mi[mb_index].mbmi.mode != SPLITMV && + mi[mb_index].mbmi.mb_skip_coeff); + + if (oci->frame_type == KEY_FRAME) + sprintf(zz, "a"); + else + sprintf(zz, "%c", dc_diff + '0'); + + vp8_blit_text(zz, y_ptr, post->y_stride); + mb_index ++; + y_ptr += 16; + } + + mb_index ++; /* border */ + y_ptr += post->y_stride * 16 - post->y_width; + + } + } + + if (flags & VP8D_DEBUG_TXT_RATE_INFO) + { + char message[512]; + sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate); + vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); + } + + /* Draw motion vectors */ + if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) + { + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = post->y_width; + int height = post->y_height; + unsigned char *y_buffer = oci->post_proc_buffer.y_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + int x0, y0; + + for (y0 = 0; y0 < height; y0 += 16) + { + for (x0 = 0; x0 < width; x0 += 16) + { + int x1, y1; + + if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode))) + { + mi++; + continue; + } + + if (mi->mbmi.mode == SPLITMV) + { + switch (mi->mbmi.partitioning) + { + case 0 : /* mv_top_bottom */ + { + union b_mode_info *bmi = &mi->bmi[0]; + MV *mv = &bmi->mv.as_mv; + + x1 = x0 + 8 + (mv->col >> 3); + y1 = y0 + 4 + (mv->row >> 3); + + constrain_line (x0+8, &x1, y0+4, &y1, width, height); + vp8_blit_line (x0+8, x1, y0+4, y1, y_buffer, y_stride); + + bmi = &mi->bmi[8]; + + x1 = x0 + 8 + (mv->col >> 3); + y1 = y0 +12 + (mv->row >> 3); + + constrain_line (x0+8, &x1, y0+12, &y1, width, height); + vp8_blit_line (x0+8, x1, y0+12, y1, y_buffer, y_stride); + + break; + } + case 1 : /* mv_left_right */ + { + union b_mode_info *bmi = &mi->bmi[0]; + MV *mv = &bmi->mv.as_mv; + + x1 = x0 + 4 + (mv->col >> 3); + y1 = y0 + 8 + (mv->row >> 3); + + constrain_line (x0+4, &x1, y0+8, &y1, width, height); + vp8_blit_line (x0+4, x1, y0+8, y1, y_buffer, y_stride); + + bmi = &mi->bmi[2]; + + x1 = x0 +12 + (mv->col >> 3); + y1 = y0 + 8 + (mv->row >> 3); + + constrain_line (x0+12, &x1, y0+8, &y1, width, height); + vp8_blit_line (x0+12, x1, y0+8, y1, y_buffer, y_stride); + + break; + } + case 2 : /* mv_quarters */ + { + union b_mode_info *bmi = &mi->bmi[0]; + MV *mv = &bmi->mv.as_mv; + + x1 = x0 + 4 + (mv->col >> 3); + y1 = y0 + 4 + (mv->row >> 3); + + constrain_line (x0+4, &x1, y0+4, &y1, width, height); + vp8_blit_line (x0+4, x1, y0+4, y1, y_buffer, y_stride); + + bmi = &mi->bmi[2]; + + x1 = x0 +12 + (mv->col >> 3); + y1 = y0 + 4 + (mv->row >> 3); + + constrain_line (x0+12, &x1, y0+4, &y1, width, height); + vp8_blit_line (x0+12, x1, y0+4, y1, y_buffer, y_stride); + + bmi = &mi->bmi[8]; + + x1 = x0 + 4 + (mv->col >> 3); + y1 = y0 +12 + (mv->row >> 3); + + constrain_line (x0+4, &x1, y0+12, &y1, width, height); + vp8_blit_line (x0+4, x1, y0+12, y1, y_buffer, y_stride); + + bmi = &mi->bmi[10]; + + x1 = x0 +12 + (mv->col >> 3); + y1 = y0 +12 + (mv->row >> 3); + + constrain_line (x0+12, &x1, y0+12, &y1, width, height); + vp8_blit_line (x0+12, x1, y0+12, y1, y_buffer, y_stride); + break; + } + default : + { + union b_mode_info *bmi = mi->bmi; + int bx0, by0; + + for (by0 = y0; by0 < (y0+16); by0 += 4) + { + for (bx0 = x0; bx0 < (x0+16); bx0 += 4) + { + MV *mv = &bmi->mv.as_mv; + + x1 = bx0 + 2 + (mv->col >> 3); + y1 = by0 + 2 + (mv->row >> 3); + + constrain_line (bx0+2, &x1, by0+2, &y1, width, height); + vp8_blit_line (bx0+2, x1, by0+2, y1, y_buffer, y_stride); + + bmi++; + } + } + } + } + } + else if (mi->mbmi.mode >= NEARESTMV) + { + MV *mv = &mi->mbmi.mv.as_mv; + const int lx0 = x0 + 8; + const int ly0 = y0 + 8; + + x1 = lx0 + (mv->col >> 3); + y1 = ly0 + (mv->row >> 3); + + if (x1 != lx0 && y1 != ly0) + { + constrain_line (lx0, &x1, ly0-1, &y1, width, height); + vp8_blit_line (lx0, x1, ly0-1, y1, y_buffer, y_stride); + + constrain_line (lx0, &x1, ly0+1, &y1, width, height); + vp8_blit_line (lx0, x1, ly0+1, y1, y_buffer, y_stride); + } + else + vp8_blit_line (lx0, x1, ly0, y1, y_buffer, y_stride); + } + + mi++; + } + mi++; + } + } + + /* Color in block modes */ + if ((flags & VP8D_DEBUG_CLR_BLK_MODES) + && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) + { + int y, x; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = post->y_width; + int height = post->y_height; + unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; + unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; + unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + + for (y = 0; y < height; y += 16) + { + for (x = 0; x < width; x += 16) + { + int Y = 0, U = 0, V = 0; + + if (mi->mbmi.mode == B_PRED && + ((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag)) + { + int by, bx; + unsigned char *yl, *ul, *vl; + union b_mode_info *bmi = mi->bmi; + + yl = y_ptr + x; + ul = u_ptr + (x>>1); + vl = v_ptr + (x>>1); + + for (by = 0; by < 16; by += 4) + { + for (bx = 0; bx < 16; bx += 4) + { + if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode)) + || (ppflags->display_mb_modes_flag & B_PRED)) + { + Y = B_PREDICTION_MODE_colors[bmi->as_mode][0]; + U = B_PREDICTION_MODE_colors[bmi->as_mode][1]; + V = B_PREDICTION_MODE_colors[bmi->as_mode][2]; + + vp8_blend_b + (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride); + } + bmi++; + } + + yl += y_stride*4; + ul += y_stride*1; + vl += y_stride*1; + } + } + else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode)) + { + Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; + U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; + V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; + + vp8_blend_mb_inner + (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride); + } + + mi++; + } + y_ptr += y_stride*16; + u_ptr += y_stride*4; + v_ptr += y_stride*4; + + mi++; + } + } + + /* Color in frame reference blocks */ + if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag) + { + int y, x; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = post->y_width; + int height = post->y_height; + unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; + unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; + unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + + for (y = 0; y < height; y += 16) + { + for (x = 0; x < width; x +=16) + { + int Y = 0, U = 0, V = 0; + + if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame)) + { + Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; + U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; + V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; + + vp8_blend_mb_outer + (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride); + } + + mi++; + } + y_ptr += y_stride*16; + u_ptr += y_stride*4; + v_ptr += y_stride*4; + + mi++; + } + } +#endif + + *dest = oci->post_proc_buffer; + + /* handle problem with extending borders */ + dest->y_width = oci->Width; + dest->y_height = oci->Height; + dest->uv_height = dest->y_height / 2; + return 0; +} +#endif diff --git a/media/libvpx/vp8/common/postproc.h b/media/libvpx/vp8/common/postproc.h new file mode 100644 index 000000000..0fa12a7c6 --- /dev/null +++ b/media/libvpx/vp8/common/postproc.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_POSTPROC_H_ +#define VP8_COMMON_POSTPROC_H_ + +#include "vpx_ports/mem.h" +struct postproc_state +{ + int last_q; + int last_noise; + char noise[3072]; + int last_base_qindex; + int last_frame_valid; + DECLARE_ALIGNED(16, char, blackclamp[16]); + DECLARE_ALIGNED(16, char, whiteclamp[16]); + DECLARE_ALIGNED(16, char, bothclamp[16]); +}; +#include "onyxc_int.h" +#include "ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif +int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, + vp8_ppflags_t *flags); + + +void vp8_de_noise(struct VP8Common *oci, + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + int uvfilter); + +void vp8_deblock(struct VP8Common *oci, + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag); + +#define MFQE_PRECISION 4 + +void vp8_multiframe_quality_enhance(struct VP8Common *cm); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_POSTPROC_H_ diff --git a/media/libvpx/vp8/common/ppflags.h b/media/libvpx/vp8/common/ppflags.h new file mode 100644 index 000000000..768224aad --- /dev/null +++ b/media/libvpx/vp8/common/ppflags.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_PPFLAGS_H_ +#define VP8_COMMON_PPFLAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif +enum +{ + VP8D_NOFILTERING = 0, + VP8D_DEBLOCK = 1<<0, + VP8D_DEMACROBLOCK = 1<<1, + VP8D_ADDNOISE = 1<<2, + VP8D_DEBUG_TXT_FRAME_INFO = 1<<3, + VP8D_DEBUG_TXT_MBLK_MODES = 1<<4, + VP8D_DEBUG_TXT_DC_DIFF = 1<<5, + VP8D_DEBUG_TXT_RATE_INFO = 1<<6, + VP8D_DEBUG_DRAW_MV = 1<<7, + VP8D_DEBUG_CLR_BLK_MODES = 1<<8, + VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9, + VP8D_MFQE = 1<<10 +}; + +typedef struct +{ + int post_proc_flag; + int deblocking_level; + int noise_level; + int display_ref_frame_flag; + int display_mb_modes_flag; + int display_b_modes_flag; + int display_mv_flag; +} vp8_ppflags_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_PPFLAGS_H_ diff --git a/media/libvpx/vp8/common/quant_common.c b/media/libvpx/vp8/common/quant_common.c new file mode 100644 index 000000000..05f921070 --- /dev/null +++ b/media/libvpx/vp8/common/quant_common.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "quant_common.h" + +static const int dc_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, + 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157, +}; + +static const int ac_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152, + 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, + 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284, +}; + + +int vp8_dc_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ]; + return retval; +} + +int vp8_dc2quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ] * 2; + return retval; + +} +int vp8_dc_uv_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ]; + + if (retval > 132) + retval = 132; + + return retval; +} + +int vp8_ac_yquant(int QIndex) +{ + int retval; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = ac_qlookup[ QIndex ]; + return retval; +} + +int vp8_ac2quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16. + * The smallest precision for that is '(x*6349) >> 12' but 16 is a good + * word size. */ + retval = (ac_qlookup[ QIndex ] * 101581) >> 16; + + if (retval < 8) + retval = 8; + + return retval; +} +int vp8_ac_uv_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = ac_qlookup[ QIndex ]; + return retval; +} diff --git a/media/libvpx/vp8/common/quant_common.h b/media/libvpx/vp8/common/quant_common.h new file mode 100644 index 000000000..700b5e6d7 --- /dev/null +++ b/media/libvpx/vp8/common/quant_common.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_QUANT_COMMON_H_ +#define VP8_COMMON_QUANT_COMMON_H_ + + +#include "string.h" +#include "blockd.h" +#include "onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern int vp8_ac_yquant(int QIndex); +extern int vp8_dc_quant(int QIndex, int Delta); +extern int vp8_dc2quant(int QIndex, int Delta); +extern int vp8_ac2quant(int QIndex, int Delta); +extern int vp8_dc_uv_quant(int QIndex, int Delta); +extern int vp8_ac_uv_quant(int QIndex, int Delta); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_QUANT_COMMON_H_ diff --git a/media/libvpx/vp8/common/reconinter.c b/media/libvpx/vp8/common/reconinter.c new file mode 100644 index 000000000..e30259558 --- /dev/null +++ b/media/libvpx/vp8/common/reconinter.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <limits.h> +#include <string.h> + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx/vpx_integer.h" +#include "blockd.h" +#include "reconinter.h" +#if CONFIG_RUNTIME_CPU_DETECT +#include "onyxc_int.h" +#endif + +void vp8_copy_mem16x16_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + + int r; + + for (r = 0; r < 16; r++) + { + memcpy(dst, src, 16); + + src += src_stride; + dst += dst_stride; + + } + +} + +void vp8_copy_mem8x8_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 8; r++) + { + memcpy(dst, src, 8); + + src += src_stride; + dst += dst_stride; + + } + +} + +void vp8_copy_mem8x4_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 4; r++) + { + memcpy(dst, src, 8); + + src += src_stride; + dst += dst_stride; + + } + +} + + +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf) +{ + int r; + unsigned char *pred_ptr = d->predictor; + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + } + else + { + for (r = 0; r < 4; r++) + { + pred_ptr[0] = ptr[0]; + pred_ptr[1] = ptr[1]; + pred_ptr[2] = ptr[2]; + pred_ptr[3] = ptr[3]; + pred_ptr += pitch; + ptr += pre_stride; + } + } +} + +static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride) +{ + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } + else + { + vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride); + } +} + +static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride) +{ + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } + else + { + vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride); + } +} + +static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf) +{ + int r; + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } + else + { + for (r = 0; r < 4; r++) + { + dst[0] = ptr[0]; + dst[1] = ptr[1]; + dst[2] = ptr[2]; + dst[3] = ptr[3]; + dst += dst_stride; + ptr += pre_stride; + } + } +} + + +/*encoder only*/ +void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x) +{ + unsigned char *uptr, *vptr; + unsigned char *upred_ptr = &x->predictor[256]; + unsigned char *vpred_ptr = &x->predictor[320]; + + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int offset; + int pre_stride = x->pre.uv_stride; + + /* calc uv motion vectors */ + mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1)); + mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1)); + mv_row /= 2; + mv_col /= 2; + mv_row &= x->fullpixel_mask; + mv_col &= x->fullpixel_mask; + + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); + } + else + { + vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8); + vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8); + } +} + +/*encoder only*/ +void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x) +{ + int i, j; + int pre_stride = x->pre.uv_stride; + unsigned char *base_pre; + + /* build uv mvs */ + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = x->block[yoffset ].bmi.mv.as_mv.row + + x->block[yoffset+1].bmi.mv.as_mv.row + + x->block[yoffset+4].bmi.mv.as_mv.row + + x->block[yoffset+5].bmi.mv.as_mv.row; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask; + + temp = x->block[yoffset ].bmi.mv.as_mv.col + + x->block[yoffset+1].bmi.mv.as_mv.col + + x->block[yoffset+4].bmi.mv.as_mv.col + + x->block[yoffset+5].bmi.mv.as_mv.col; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask; + + x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int; + } + } + + base_pre = x->pre.u_buffer; + for (i = 16; i < 20; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride); + else + { + vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict); + } + } + + base_pre = x->pre.v_buffer; + for (i = 20; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride); + else + { + vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict); + } + } +} + + +/*encoder only*/ +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride) +{ + unsigned char *ptr_base; + unsigned char *ptr; + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int pre_stride = x->pre.y_stride; + + ptr_base = x->pre.y_buffer; + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, + dst_y, dst_ystride); + } + else + { + vp8_copy_mem16x16(ptr, pre_stride, dst_y, + dst_ystride); + } +} + +static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) +{ + /* If the MV points so far into the UMV border that no visible pixels + * are used for reconstruction, the subpel part of the MV can be + * discarded and the MV limited to 16 pixels with equivalent results. + * + * This limit kicks in at 19 pixels for the top and left edges, for + * the 16 pixels plus 3 taps right of the central pixel when subpel + * filtering. The bottom and right edges use 16 pixels plus 2 pixels + * left of the central pixel when filtering. + */ + if (mv->col < (xd->mb_to_left_edge - (19 << 3))) + mv->col = xd->mb_to_left_edge - (16 << 3); + else if (mv->col > xd->mb_to_right_edge + (18 << 3)) + mv->col = xd->mb_to_right_edge + (16 << 3); + + if (mv->row < (xd->mb_to_top_edge - (19 << 3))) + mv->row = xd->mb_to_top_edge - (16 << 3); + else if (mv->row > xd->mb_to_bottom_edge + (18 << 3)) + mv->row = xd->mb_to_bottom_edge + (16 << 3); +} + +/* A version of the above function for chroma block MVs.*/ +static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) +{ + mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ? + (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col; + mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ? + (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col; + + mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ? + (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row; + mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ? + (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row; +} + +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride) +{ + int offset; + unsigned char *ptr; + unsigned char *uptr, *vptr; + + int_mv _16x16mv; + + unsigned char *ptr_base = x->pre.y_buffer; + int pre_stride = x->pre.y_stride; + + _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int; + + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + { + clamp_mv_to_umv_border(&_16x16mv.as_mv, x); + } + + ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); + + if ( _16x16mv.as_int & 0x00070007) + { + x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_y, dst_ystride); + } + else + { + vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + } + + /* calc uv motion vectors */ + _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1)); + _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1)); + _16x16mv.as_mv.row /= 2; + _16x16mv.as_mv.col /= 2; + _16x16mv.as_mv.row &= x->fullpixel_mask; + _16x16mv.as_mv.col &= x->fullpixel_mask; + + pre_stride >>= 1; + offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ( _16x16mv.as_int & 0x00070007) + { + x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_u, dst_uvstride); + x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_v, dst_uvstride); + } + else + { + vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); + vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); + } +} + +static void build_inter4x4_predictors_mb(MACROBLOCKD *x) +{ + int i; + unsigned char *base_dst = x->dst.y_buffer; + unsigned char *base_pre = x->pre.y_buffer; + + if (x->mode_info_context->mbmi.partitioning < 3) + { + BLOCKD *b; + int dst_stride = x->dst.y_stride; + + x->block[ 0].bmi = x->mode_info_context->bmi[ 0]; + x->block[ 2].bmi = x->mode_info_context->bmi[ 2]; + x->block[ 8].bmi = x->mode_info_context->bmi[ 8]; + x->block[10].bmi = x->mode_info_context->bmi[10]; + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + { + clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x); + } + + b = &x->block[ 0]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + b = &x->block[ 2]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + b = &x->block[ 8]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + b = &x->block[10]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + } + else + { + for (i = 0; i < 16; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->dst.y_stride; + + x->block[i+0].bmi = x->mode_info_context->bmi[i+0]; + x->block[i+1].bmi = x->mode_info_context->bmi[i+1]; + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + { + clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x); + } + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride); + else + { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + } + + } + + } + base_dst = x->dst.u_buffer; + base_pre = x->pre.u_buffer; + for (i = 16; i < 20; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->dst.uv_stride; + + /* Note: uv mvs already clamped in build_4x4uvmvs() */ + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride); + else + { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + } + } + + base_dst = x->dst.v_buffer; + base_pre = x->pre.v_buffer; + for (i = 20; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->dst.uv_stride; + + /* Note: uv mvs already clamped in build_4x4uvmvs() */ + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride); + else + { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + } + } +} + +static +void build_4x4uvmvs(MACROBLOCKD *x) +{ + int i, j; + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask; + + temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask; + + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x); + + x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int; + } + } +} + +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd) +{ + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + } + else + { + build_4x4uvmvs(xd); + build_inter4x4_predictors_mb(xd); + } +} diff --git a/media/libvpx/vp8/common/reconinter.h b/media/libvpx/vp8/common/reconinter.h new file mode 100644 index 000000000..ba979b966 --- /dev/null +++ b/media/libvpx/vp8/common/reconinter.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_RECONINTER_H_ +#define VP8_COMMON_RECONINTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); +extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); + + +extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride); +extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, + unsigned char *base_pre, + int pre_stride, + vp8_subpix_fn_t sppf); + +extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); +extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_RECONINTER_H_ diff --git a/media/libvpx/vp8/common/reconintra.c b/media/libvpx/vp8/common/reconintra.c new file mode 100644 index 000000000..0a6c51b35 --- /dev/null +++ b/media/libvpx/vp8/common/reconintra.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "blockd.h" + +void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) +{ + unsigned char yleft_col[16]; + unsigned char ytop_left = yabove_row[-1]; + int r, c, i; + + for (i = 0; i < 16; i++) + { + yleft_col[i] = yleft[i* left_stride]; + } + + /* for Y */ + switch (x->mode_info_context->mbmi.mode) + { + case DC_PRED: + { + int expected_dc; + int shift; + int average = 0; + + + if (x->up_available || x->left_available) + { + if (x->up_available) + { + for (i = 0; i < 16; i++) + { + average += yabove_row[i]; + } + } + + if (x->left_available) + { + + for (i = 0; i < 16; i++) + { + average += yleft_col[i]; + } + + } + + + + shift = 3 + x->up_available + x->left_available; + expected_dc = (average + (1 << (shift - 1))) >> shift; + } + else + { + expected_dc = 128; + } + + /*memset(ypred_ptr, expected_dc, 256);*/ + for (r = 0; r < 16; r++) + { + memset(ypred_ptr, expected_dc, 16); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: + { + + for (r = 0; r < 16; r++) + { + + ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0]; + ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1]; + ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2]; + ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3]; + ypred_ptr += y_stride; + } + } + break; + case H_PRED: + { + + for (r = 0; r < 16; r++) + { + + memset(ypred_ptr, yleft_col[r], 16); + ypred_ptr += y_stride; + } + + } + break; + case TM_PRED: + { + + for (r = 0; r < 16; r++) + { + for (c = 0; c < 16; c++) + { + int pred = yleft_col[r] + yabove_row[ c] - ytop_left; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + ypred_ptr[c] = pred; + } + + ypred_ptr += y_stride; + } + + } + break; + case B_PRED: + case NEARESTMV: + case NEARMV: + case ZEROMV: + case NEWMV: + case SPLITMV: + case MB_MODE_COUNT: + break; + } +} + +void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) +{ + unsigned char uleft_col[8]; + unsigned char utop_left = uabove_row[-1]; + unsigned char vleft_col[8]; + unsigned char vtop_left = vabove_row[-1]; + + int i, j; + + for (i = 0; i < 8; i++) + { + uleft_col[i] = uleft [i* left_stride]; + vleft_col[i] = vleft [i* left_stride]; + } + + switch (x->mode_info_context->mbmi.uv_mode) + { + case DC_PRED: + { + int expected_udc; + int expected_vdc; + int shift; + int Uaverage = 0; + int Vaverage = 0; + + if (x->up_available) + { + for (i = 0; i < 8; i++) + { + Uaverage += uabove_row[i]; + Vaverage += vabove_row[i]; + } + } + + if (x->left_available) + { + for (i = 0; i < 8; i++) + { + Uaverage += uleft_col[i]; + Vaverage += vleft_col[i]; + } + } + + if (!x->up_available && !x->left_available) + { + expected_udc = 128; + expected_vdc = 128; + } + else + { + shift = 2 + x->up_available + x->left_available; + expected_udc = (Uaverage + (1 << (shift - 1))) >> shift; + expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift; + } + + + /*memset(upred_ptr,expected_udc,64);*/ + /*memset(vpred_ptr,expected_vdc,64);*/ + for (i = 0; i < 8; i++) + { + memset(upred_ptr, expected_udc, 8); + memset(vpred_ptr, expected_vdc, 8); + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + } + } + break; + case V_PRED: + { + for (i = 0; i < 8; i++) + { + memcpy(upred_ptr, uabove_row, 8); + memcpy(vpred_ptr, vabove_row, 8); + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + } + + } + break; + case H_PRED: + { + for (i = 0; i < 8; i++) + { + memset(upred_ptr, uleft_col[i], 8); + memset(vpred_ptr, vleft_col[i], 8); + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + } + } + + break; + case TM_PRED: + { + for (i = 0; i < 8; i++) + { + for (j = 0; j < 8; j++) + { + int predu = uleft_col[i] + uabove_row[j] - utop_left; + int predv = vleft_col[i] + vabove_row[j] - vtop_left; + + if (predu < 0) + predu = 0; + + if (predu > 255) + predu = 255; + + if (predv < 0) + predv = 0; + + if (predv > 255) + predv = 255; + + upred_ptr[j] = predu; + vpred_ptr[j] = predv; + } + + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + } + + } + break; + case B_PRED: + case NEARESTMV: + case NEARMV: + case ZEROMV: + case NEWMV: + case SPLITMV: + case MB_MODE_COUNT: + break; + } +} diff --git a/media/libvpx/vp8/common/reconintra4x4.c b/media/libvpx/vp8/common/reconintra4x4.c new file mode 100644 index 000000000..3d4f2c404 --- /dev/null +++ b/media/libvpx/vp8/common/reconintra4x4.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "blockd.h" + +void vp8_intra4x4_predict_c(unsigned char *Above, + unsigned char *yleft, int left_stride, + int _b_mode, + unsigned char *dst, int dst_stride, + unsigned char top_left) +{ + int i, r, c; + B_PREDICTION_MODE b_mode = (B_PREDICTION_MODE)_b_mode; + unsigned char Left[4]; + Left[0] = yleft[0]; + Left[1] = yleft[left_stride]; + Left[2] = yleft[2 * left_stride]; + Left[3] = yleft[3 * left_stride]; + + switch (b_mode) + { + case B_DC_PRED: + { + int expected_dc = 0; + + for (i = 0; i < 4; i++) + { + expected_dc += Above[i]; + expected_dc += Left[i]; + } + + expected_dc = (expected_dc + 4) >> 3; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + dst[c] = expected_dc; + } + + dst += dst_stride; + } + } + break; + case B_TM_PRED: + { + /* prediction similar to true_motion prediction */ + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int pred = Above[c] - top_left + Left[r]; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + dst[c] = pred; + } + + dst += dst_stride; + } + } + break; + + case B_VE_PRED: + { + + unsigned int ap[4]; + ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2; + ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2; + ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2; + ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + + dst[c] = ap[c]; + } + + dst += dst_stride; + } + + } + break; + + + case B_HE_PRED: + { + + unsigned int lp[4]; + lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2; + lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2; + lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2; + lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + dst[c] = lp[r]; + } + + dst += dst_stride; + } + } + break; + case B_LD_PRED: + { + unsigned char *ptr = Above; + dst[0 * dst_stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + dst[0 * dst_stride + 1] = + dst[1 * dst_stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; + dst[0 * dst_stride + 2] = + dst[1 * dst_stride + 1] = + dst[2 * dst_stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + dst[0 * dst_stride + 3] = + dst[1 * dst_stride + 2] = + dst[2 * dst_stride + 1] = + dst[3 * dst_stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + dst[1 * dst_stride + 3] = + dst[2 * dst_stride + 2] = + dst[3 * dst_stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + dst[2 * dst_stride + 3] = + dst[3 * dst_stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + dst[3 * dst_stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + + } + break; + case B_RD_PRED: + { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + dst[3 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[3 * dst_stride + 1] = + dst[2 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[3 * dst_stride + 2] = + dst[2 * dst_stride + 1] = + dst[1 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[3 * dst_stride + 3] = + dst[2 * dst_stride + 2] = + dst[1 * dst_stride + 1] = + dst[0 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[2 * dst_stride + 3] = + dst[1 * dst_stride + 2] = + dst[0 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + dst[1 * dst_stride + 3] = + dst[0 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + dst[0 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + + } + break; + case B_VR_PRED: + { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[3 * dst_stride + 1] = + dst[1 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[2 * dst_stride + 1] = + dst[0 * dst_stride + 0] = (pp[4] + pp[5] + 1) >> 1; + dst[3 * dst_stride + 2] = + dst[1 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + dst[2 * dst_stride + 2] = + dst[0 * dst_stride + 1] = (pp[5] + pp[6] + 1) >> 1; + dst[3 * dst_stride + 3] = + dst[1 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + dst[2 * dst_stride + 3] = + dst[0 * dst_stride + 2] = (pp[6] + pp[7] + 1) >> 1; + dst[1 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + dst[0 * dst_stride + 3] = (pp[7] + pp[8] + 1) >> 1; + + } + break; + case B_VL_PRED: + { + + unsigned char *pp = Above; + + dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1; + dst[1 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[2 * dst_stride + 0] = + dst[0 * dst_stride + 1] = (pp[1] + pp[2] + 1) >> 1; + dst[1 * dst_stride + 1] = + dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 1] = + dst[0 * dst_stride + 2] = (pp[2] + pp[3] + 1) >> 1; + dst[3 * dst_stride + 1] = + dst[1 * dst_stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[0 * dst_stride + 3] = + dst[2 * dst_stride + 2] = (pp[3] + pp[4] + 1) >> 1; + dst[1 * dst_stride + 3] = + dst[3 * dst_stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[2 * dst_stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + dst[3 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + case B_HD_PRED: + { + unsigned char pp[9]; + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + dst[3 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1; + dst[3 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[2 * dst_stride + 0] = + dst[3 * dst_stride + 2] = (pp[1] + pp[2] + 1) >> 1; + dst[2 * dst_stride + 1] = + dst[3 * dst_stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 2] = + dst[1 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1; + dst[2 * dst_stride + 3] = + dst[1 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[1 * dst_stride + 2] = + dst[0 * dst_stride + 0] = (pp[3] + pp[4] + 1) >> 1; + dst[1 * dst_stride + 3] = + dst[0 * dst_stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[0 * dst_stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + dst[0 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + + case B_HU_PRED: + { + unsigned char *pp = Left; + dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1; + dst[0 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[0 * dst_stride + 2] = + dst[1 * dst_stride + 0] = (pp[1] + pp[2] + 1) >> 1; + dst[0 * dst_stride + 3] = + dst[1 * dst_stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[1 * dst_stride + 2] = + dst[2 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1; + dst[1 * dst_stride + 3] = + dst[2 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 2] = + dst[2 * dst_stride + 3] = + dst[3 * dst_stride + 0] = + dst[3 * dst_stride + 1] = + dst[3 * dst_stride + 2] = + dst[3 * dst_stride + 3] = pp[3]; + } + break; + + default: + break; + + } +} diff --git a/media/libvpx/vp8/common/reconintra4x4.h b/media/libvpx/vp8/common/reconintra4x4.h new file mode 100644 index 000000000..ed59c9edd --- /dev/null +++ b/media/libvpx/vp8/common/reconintra4x4.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_RECONINTRA4X4_H_ +#define VP8_COMMON_RECONINTRA4X4_H_ +#include "vp8/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static void intra_prediction_down_copy(MACROBLOCKD *xd, + unsigned char *above_right_src) +{ + int dst_stride = xd->dst.y_stride; + unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16; + + unsigned int *src_ptr = (unsigned int *)above_right_src; + unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride); + unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride); + unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride); + + *dst_ptr0 = *src_ptr; + *dst_ptr1 = *src_ptr; + *dst_ptr2 = *src_ptr; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_RECONINTRA4X4_H_ diff --git a/media/libvpx/vp8/common/rtcd.c b/media/libvpx/vp8/common/rtcd.c new file mode 100644 index 000000000..98c2ecd74 --- /dev/null +++ b/media/libvpx/vp8/common/rtcd.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vp8_rtcd.h" +#include "vpx_ports/vpx_once.h" +#ifdef _MSC_VER +#include <intrin.h> +#endif + + +void vp8_rtcd() +{ + once(setup_rtcd_internal); +} diff --git a/media/libvpx/vp8/common/setupintrarecon.c b/media/libvpx/vp8/common/setupintrarecon.c new file mode 100644 index 000000000..669564db4 --- /dev/null +++ b/media/libvpx/vp8/common/setupintrarecon.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "setupintrarecon.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) +{ + int i; + + /* set up frame new frame for intra coded blocks */ + memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + for (i = 0; i < ybf->y_height; i++) + ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129; + + memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; i++) + ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129; + + memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; i++) + ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129; + +} + +void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf) +{ + memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); +} diff --git a/media/libvpx/vp8/common/setupintrarecon.h b/media/libvpx/vp8/common/setupintrarecon.h new file mode 100644 index 000000000..608f4a9ac --- /dev/null +++ b/media/libvpx/vp8/common/setupintrarecon.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_SETUPINTRARECON_H_ +#define VP8_COMMON_SETUPINTRARECON_H_ + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif +extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); +extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf); + +static +void setup_intra_recon_left(unsigned char *y_buffer, + unsigned char *u_buffer, + unsigned char *v_buffer, + int y_stride, + int uv_stride) +{ + int i; + + for (i = 0; i < 16; i++) + y_buffer[y_stride *i] = (unsigned char) 129; + + for (i = 0; i < 8; i++) + u_buffer[uv_stride *i] = (unsigned char) 129; + + for (i = 0; i < 8; i++) + v_buffer[uv_stride *i] = (unsigned char) 129; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SETUPINTRARECON_H_ diff --git a/media/libvpx/vp8/common/swapyv12buffer.c b/media/libvpx/vp8/common/swapyv12buffer.c new file mode 100644 index 000000000..73656b3d7 --- /dev/null +++ b/media/libvpx/vp8/common/swapyv12buffer.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "swapyv12buffer.h" + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame) +{ + unsigned char *temp; + + temp = last_frame->buffer_alloc; + last_frame->buffer_alloc = new_frame->buffer_alloc; + new_frame->buffer_alloc = temp; + + temp = last_frame->y_buffer; + last_frame->y_buffer = new_frame->y_buffer; + new_frame->y_buffer = temp; + + temp = last_frame->u_buffer; + last_frame->u_buffer = new_frame->u_buffer; + new_frame->u_buffer = temp; + + temp = last_frame->v_buffer; + last_frame->v_buffer = new_frame->v_buffer; + new_frame->v_buffer = temp; + +} diff --git a/media/libvpx/vp8/common/swapyv12buffer.h b/media/libvpx/vp8/common/swapyv12buffer.h new file mode 100644 index 000000000..1d66cd3d6 --- /dev/null +++ b/media/libvpx/vp8/common/swapyv12buffer.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_SWAPYV12BUFFER_H_ +#define VP8_COMMON_SWAPYV12BUFFER_H_ + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SWAPYV12BUFFER_H_ diff --git a/media/libvpx/vp8/common/systemdependent.h b/media/libvpx/vp8/common/systemdependent.h new file mode 100644 index 000000000..3d44e37cf --- /dev/null +++ b/media/libvpx/vp8/common/systemdependent.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_ +#define VP8_COMMON_SYSTEMDEPENDENT_H_ + +#include "vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8Common; +void vp8_machine_specific_config(struct VP8Common *); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SYSTEMDEPENDENT_H_ diff --git a/media/libvpx/vp8/common/threading.h b/media/libvpx/vp8/common/threading.h new file mode 100644 index 000000000..01c82dbb8 --- /dev/null +++ b/media/libvpx/vp8/common/threading.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_THREADING_H_ +#define VP8_COMMON_THREADING_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD + +/* Thread management macros */ +#ifdef _WIN32 +/* Win32 */ +#include <process.h> +#include <windows.h> +#define THREAD_FUNCTION DWORD WINAPI +#define THREAD_FUNCTION_RETURN DWORD +#define THREAD_SPECIFIC_INDEX DWORD +#define pthread_t HANDLE +#define pthread_attr_t DWORD +#define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL) +#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread)) +#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread) +#define thread_sleep(nms) Sleep(nms) +#define pthread_cancel(thread) terminate_thread(thread,0) +#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();}; +#define pthread_getspecific(ts_key) TlsGetValue(ts_key) +#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value) +#define pthread_self() GetCurrentThreadId() + +#elif defined(__OS2__) +/* OS/2 */ +#define INCL_DOS +#include <os2.h> + +#include <stdlib.h> +#define THREAD_FUNCTION void +#define THREAD_FUNCTION_RETURN void +#define THREAD_SPECIFIC_INDEX PULONG +#define pthread_t TID +#define pthread_attr_t ULONG +#define pthread_create(thhandle,attr,thfunc,tharg) \ + ((int)((*(thhandle)=_beginthread(thfunc,NULL,1024*1024,tharg))==-1)) +#define pthread_join(thread, result) ((int)DosWaitThread(&(thread),0)) +#define pthread_detach(thread) 0 +#define thread_sleep(nms) DosSleep(nms) +#define pthread_cancel(thread) DosKillThread(thread) +#define ts_key_create(ts_key, destructor) \ + DosAllocThreadLocalMemory(1, &(ts_key)); +#define pthread_getspecific(ts_key) ((void *)(*(ts_key))) +#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value)) +#define pthread_self() _gettid() +#else +#ifdef __APPLE__ +#include <mach/mach_init.h> +#include <mach/semaphore.h> +#include <mach/task.h> +#include <time.h> +#include <unistd.h> + +#else +#include <semaphore.h> +#endif + +#include <pthread.h> +/* pthreads */ +/* Nearly everything is already defined */ +#define THREAD_FUNCTION void * +#define THREAD_FUNCTION_RETURN void * +#define THREAD_SPECIFIC_INDEX pthread_key_t +#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor); +#endif + +/* Syncrhronization macros: Win32 and Pthreads */ +#ifdef _WIN32 +#define sem_t HANDLE +#define pause(voidpara) __asm PAUSE +#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL) +#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE)) +#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL) +#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE) +#define thread_sleep(nms) Sleep(nms) + +#elif defined(__OS2__) +typedef struct +{ + HEV event; + HMTX wait_mutex; + HMTX count_mutex; + int count; +} sem_t; + +static inline int sem_init(sem_t *sem, int pshared, unsigned int value) +{ + DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0, + value > 0 ? TRUE : FALSE); + DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE); + DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE); + + sem->count = value; + + return 0; +} + +static inline int sem_wait(sem_t * sem) +{ + DosRequestMutexSem(sem->wait_mutex, -1); + + DosWaitEventSem(sem->event, -1); + + DosRequestMutexSem(sem->count_mutex, -1); + + sem->count--; + if (sem->count == 0) + { + ULONG post_count; + + DosResetEventSem(sem->event, &post_count); + } + + DosReleaseMutexSem(sem->count_mutex); + + DosReleaseMutexSem(sem->wait_mutex); + + return 0; +} + +static inline int sem_post(sem_t * sem) +{ + DosRequestMutexSem(sem->count_mutex, -1); + + if (sem->count < 32768) + { + sem->count++; + DosPostEventSem(sem->event); + } + + DosReleaseMutexSem(sem->count_mutex); + + return 0; +} + +static inline int sem_destroy(sem_t * sem) +{ + DosCloseEventSem(sem->event); + DosCloseMutexSem(sem->wait_mutex); + DosCloseMutexSem(sem->count_mutex); + + return 0; +} + +#define thread_sleep(nms) DosSleep(nms) + +#else + +#ifdef __APPLE__ +#define sem_t semaphore_t +#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z) +#define sem_wait(sem) (semaphore_wait(*sem) ) +#define sem_post(sem) semaphore_signal(*sem) +#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem) +#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ +#else +#include <unistd.h> +#include <sched.h> +#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ +#endif +/* Not Windows. Assume pthreads */ + +#endif + +#if ARCH_X86 || ARCH_X86_64 +#include "vpx_ports/x86.h" +#else +#define x86_pause_hint() +#endif + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_THREADING_H_ diff --git a/media/libvpx/vp8/common/treecoder.c b/media/libvpx/vp8/common/treecoder.c new file mode 100644 index 000000000..d80c64bdf --- /dev/null +++ b/media/libvpx/vp8/common/treecoder.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#if CONFIG_DEBUG +#include <assert.h> +#endif +#include <stdio.h> + +#include "treecoder.h" + +static void tree2tok( + struct vp8_token_struct *const p, + vp8_tree t, + int i, + int v, + int L +) +{ + v += v; + ++L; + + do + { + const vp8_tree_index j = t[i++]; + + if (j <= 0) + { + p[-j].value = v; + p[-j].Len = L; + } + else + tree2tok(p, t, j, v, L); + } + while (++v & 1); +} + +void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t) +{ + tree2tok(p, t, 0, 0, 0); +} + +void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t, + int offset) +{ + tree2tok(p - offset, t, 0, 0, 0); +} + +static void branch_counts( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ] +) +{ + const int tree_len = n - 1; + int t = 0; + +#if CONFIG_DEBUG + assert(tree_len); +#endif + + do + { + branch_ct[t][0] = branch_ct[t][1] = 0; + } + while (++t < tree_len); + + t = 0; + + do + { + int L = tok[t].Len; + const int enc = tok[t].value; + const unsigned int ct = num_events[t]; + + vp8_tree_index i = 0; + + do + { + const int b = (enc >> --L) & 1; + const int j = i >> 1; +#if CONFIG_DEBUG + assert(j < tree_len && 0 <= L); +#endif + + branch_ct [j] [b] += ct; + i = tree[ i + b]; + } + while (i > 0); + +#if CONFIG_DEBUG + assert(!L); +#endif + } + while (++t < n); + +} + + +void vp8_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + unsigned int Pfac, + int rd +) +{ + const int tree_len = n - 1; + int t = 0; + + branch_counts(n, tok, tree, branch_ct, num_events); + + do + { + const unsigned int *const c = branch_ct[t]; + const unsigned int tot = c[0] + c[1]; + +#if CONFIG_DEBUG + assert(tot < (1 << 24)); /* no overflow below */ +#endif + + if (tot) + { + const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot; + probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */ + } + else + probs[t] = vp8_prob_half; + } + while (++t < tree_len); +} diff --git a/media/libvpx/vp8/common/treecoder.h b/media/libvpx/vp8/common/treecoder.h new file mode 100644 index 000000000..d22b7c570 --- /dev/null +++ b/media/libvpx/vp8/common/treecoder.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_TREECODER_H_ +#define VP8_COMMON_TREECODER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned char vp8bc_index_t; /* probability index */ + + +typedef unsigned char vp8_prob; + +#define vp8_prob_half ( (vp8_prob) 128) + +typedef signed char vp8_tree_index; +struct bool_coder_spec; + +typedef struct bool_coder_spec bool_coder_spec; +typedef struct bool_writer bool_writer; +typedef struct bool_reader bool_reader; + +typedef const bool_coder_spec c_bool_coder_spec; +typedef const bool_writer c_bool_writer; +typedef const bool_reader c_bool_reader; + + + +# define vp8_complement( x) (255 - x) + + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vp8_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vp8_tree_index vp8_tree[], *vp8_tree_p; + + +typedef const struct vp8_token_struct +{ + int value; + int Len; +} vp8_token; + +/* Construct encoding array from tree. */ + +void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree); +void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree, + int offset); + + +/* Convert array of token occurrence counts into a table of probabilities + for the associated binary encoding tree. Also writes count of branches + taken for each node on the tree; this facilitiates decisions as to + probability updates. */ + +void vp8_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + unsigned int Pfactor, + int Round +); + +/* Variant of above using coder spec rather than hardwired 8-bit probs. */ + +void vp8bc_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + c_bool_coder_spec *s +); + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_TREECODER_H_ diff --git a/media/libvpx/vp8/common/variance.h b/media/libvpx/vp8/common/variance.h new file mode 100644 index 000000000..c6c9f41bf --- /dev/null +++ b/media/libvpx/vp8/common/variance.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_VARIANCE_H_ +#define VP8_COMMON_VARIANCE_H_ + +#include "vpx_config.h" + +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned int(*vpx_sad_fn_t)( + const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int ref_stride); + +typedef void (*vp8_copy32xn_fn_t)( + const unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int ref_stride, + int n); + +typedef void (*vpx_sad_multi_fn_t)( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_array, + int ref_stride, + unsigned int *sad_array); + +typedef void (*vpx_sad_multi_d_fn_t) + ( + const unsigned char *src_ptr, + int source_stride, + const unsigned char * const ref_array[], + int ref_stride, + unsigned int *sad_array + ); + +typedef unsigned int (*vpx_variance_fn_t) + ( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned int *sse + ); + +typedef unsigned int (*vp8_subpixvariance_fn_t) + ( + const unsigned char *src_ptr, + int source_stride, + int xoffset, + int yoffset, + const unsigned char *ref_ptr, + int Refstride, + unsigned int *sse + ); + +typedef struct variance_vtable +{ + vpx_sad_fn_t sdf; + vpx_variance_fn_t vf; + vp8_subpixvariance_fn_t svf; + vpx_variance_fn_t svf_halfpix_h; + vpx_variance_fn_t svf_halfpix_v; + vpx_variance_fn_t svf_halfpix_hv; + vpx_sad_multi_fn_t sdx3f; + vpx_sad_multi_fn_t sdx8f; + vpx_sad_multi_d_fn_t sdx4df; +#if ARCH_X86 || ARCH_X86_64 + vp8_copy32xn_fn_t copymem; +#endif +} vp8_variance_fn_ptr_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_VARIANCE_H_ diff --git a/media/libvpx/vp8/common/variance_c.c b/media/libvpx/vp8/common/variance_c.c new file mode 100644 index 000000000..02915a4de --- /dev/null +++ b/media/libvpx/vp8/common/variance_c.c @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "filter.h" +#include "variance.h" + +/* This is a bad idea. + * ctz = count trailing zeros */ +static int ctz(int a) { + int b = 0; + while (a != 1) { + a >>= 1; + b++; + } + return b; +} + +static unsigned int variance( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse) +{ + int i, j; + int diff, sum; + + sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) + { + for (j = 0; j < w; j++) + { + diff = src_ptr[j] - ref_ptr[j]; + sum += diff; + *sse += diff * diff; + } + + src_ptr += source_stride; + ref_ptr += recon_stride; + } + + return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h))))); +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * UINT32 pixel_step : Offset between filter input samples (see notes). + * UINT32 output_height : Input block height. + * UINT32 output_width : Input block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : INT32 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement first-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Produces INT32 output to retain precision for next pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step=stride). + * It defines the offset required to move from one input + * to the next. + * + ****************************************************************************/ +static void var_filter_block2d_bil_first_pass +( + const unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + /* Apply bilinear filter */ + output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[pixel_step] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : INT32 *src_ptr : Pointer to source block. + * UINT32 src_pixels_per_line : Stride of input block. + * UINT32 pixel_step : Offset between filter input samples (see notes). + * UINT32 output_height : Input block height. + * UINT32 output_width : Input block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement second-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step=stride). + * It defines the offset required to move from one input + * to the next. + * + ****************************************************************************/ +static void var_filter_block2d_bil_second_pass +( + const unsigned short *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + /* Apply filter */ + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[pixel_step] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2); + output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + + +unsigned int vp8_sub_pixel_variance4x4_c +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned char temp2[20*16]; + const short *HFilter, *VFilter; + unsigned short FData3[5*4]; /* Temp data bufffer used in filtering */ + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + /* First filter 1d Horizontal */ + var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter); + + /* Now filter Verticaly */ + var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter); + + return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse); +} + + +unsigned int vp8_sub_pixel_variance8x8_c +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[9*8]; /* Temp data bufffer used in filtering */ + unsigned char temp2[20*16]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); + var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); + + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse); +} + +unsigned int vp8_sub_pixel_variance16x16_c +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[17*16]; /* Temp data bufffer used in filtering */ + unsigned char temp2[20*16]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); + var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); + + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_h_c( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_v_c( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_hv_c( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_sub_pixel_variance16x8_c +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[16*9]; /* Temp data bufffer used in filtering */ + unsigned char temp2[20*16]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); + var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); + + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse); +} + +unsigned int vp8_sub_pixel_variance8x16_c +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short FData3[9*16]; /* Temp data bufffer used in filtering */ + unsigned char temp2[20*16]; + const short *HFilter, *VFilter; + + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + + var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); + var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter); + + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse); +} diff --git a/media/libvpx/vp8/common/vp8_entropymodedata.h b/media/libvpx/vp8/common/vp8_entropymodedata.h new file mode 100644 index 000000000..c4aed4989 --- /dev/null +++ b/media/libvpx/vp8/common/vp8_entropymodedata.h @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. +*/ + +#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*Generated file, included by entropymode.c*/ + + +const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] = +{ + { 0, 1 }, + { 2, 2 }, + { 6, 3 }, + { 28, 5 }, + { 30, 5 }, + { 58, 6 }, + { 59, 6 }, + { 62, 6 }, + { 126, 7 }, + { 127, 7 } +}; + +const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] = +{ + { 0, 1 }, + { 4, 3 }, + { 5, 3 }, + { 6, 3 }, + { 7, 3 } +}; + +const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] = +{ + { 4, 3 }, + { 5, 3 }, + { 6, 3 }, + { 7, 3 }, + { 0, 1 } +}; + +const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] = +{ + { 0, 1 }, + { 2, 2 }, + { 6, 3 }, + { 7, 3 } +}; + +const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] = +{ + { 6, 3 }, + { 7, 3 }, + { 2, 2 }, + { 0, 1 } +}; + +const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] = +{ + { 2, 2 }, + { 6, 3 }, + { 0, 1 }, + { 14, 4 }, + { 15, 4 } +}; + +const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] = +{ + { 0, 1 }, + { 2, 2 }, + { 6, 3 }, + { 7, 3 } +}; + +const struct vp8_token_struct vp8_small_mvencodings[8] = +{ + { 0, 3 }, + { 1, 3 }, + { 2, 3 }, + { 3, 3 }, + { 4, 3 }, + { 5, 3 }, + { 6, 3 }, + { 7, 3 } +}; + +const vp8_prob vp8_ymode_prob[VP8_YMODES-1] = +{ + 112, 86, 140, 37 +}; + +const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] = +{ + 145, 156, 163, 128 +}; + +const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] = +{ + 162, 101, 204 +}; + +const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] = +{ + 142, 114, 183 +}; + +const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] = +{ + 120, 90, 79, 133, 87, 85, 80, 111, 151 +}; + + + +const vp8_prob vp8_kf_bmode_prob +[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] = +{ + { + { 231, 120, 48, 89, 115, 113, 120, 152, 112 }, + { 152, 179, 64, 126, 170, 118, 46, 70, 95 }, + { 175, 69, 143, 80, 85, 82, 72, 155, 103 }, + { 56, 58, 10, 171, 218, 189, 17, 13, 152 }, + { 144, 71, 10, 38, 171, 213, 144, 34, 26 }, + { 114, 26, 17, 163, 44, 195, 21, 10, 173 }, + { 121, 24, 80, 195, 26, 62, 44, 64, 85 }, + { 170, 46, 55, 19, 136, 160, 33, 206, 71 }, + { 63, 20, 8, 114, 114, 208, 12, 9, 226 }, + { 81, 40, 11, 96, 182, 84, 29, 16, 36 } + }, + { + { 134, 183, 89, 137, 98, 101, 106, 165, 148 }, + { 72, 187, 100, 130, 157, 111, 32, 75, 80 }, + { 66, 102, 167, 99, 74, 62, 40, 234, 128 }, + { 41, 53, 9, 178, 241, 141, 26, 8, 107 }, + { 104, 79, 12, 27, 217, 255, 87, 17, 7 }, + { 74, 43, 26, 146, 73, 166, 49, 23, 157 }, + { 65, 38, 105, 160, 51, 52, 31, 115, 128 }, + { 87, 68, 71, 44, 114, 51, 15, 186, 23 }, + { 47, 41, 14, 110, 182, 183, 21, 17, 194 }, + { 66, 45, 25, 102, 197, 189, 23, 18, 22 } + }, + { + { 88, 88, 147, 150, 42, 46, 45, 196, 205 }, + { 43, 97, 183, 117, 85, 38, 35, 179, 61 }, + { 39, 53, 200, 87, 26, 21, 43, 232, 171 }, + { 56, 34, 51, 104, 114, 102, 29, 93, 77 }, + { 107, 54, 32, 26, 51, 1, 81, 43, 31 }, + { 39, 28, 85, 171, 58, 165, 90, 98, 64 }, + { 34, 22, 116, 206, 23, 34, 43, 166, 73 }, + { 68, 25, 106, 22, 64, 171, 36, 225, 114 }, + { 34, 19, 21, 102, 132, 188, 16, 76, 124 }, + { 62, 18, 78, 95, 85, 57, 50, 48, 51 } + }, + { + { 193, 101, 35, 159, 215, 111, 89, 46, 111 }, + { 60, 148, 31, 172, 219, 228, 21, 18, 111 }, + { 112, 113, 77, 85, 179, 255, 38, 120, 114 }, + { 40, 42, 1, 196, 245, 209, 10, 25, 109 }, + { 100, 80, 8, 43, 154, 1, 51, 26, 71 }, + { 88, 43, 29, 140, 166, 213, 37, 43, 154 }, + { 61, 63, 30, 155, 67, 45, 68, 1, 209 }, + { 142, 78, 78, 16, 255, 128, 34, 197, 171 }, + { 41, 40, 5, 102, 211, 183, 4, 1, 221 }, + { 51, 50, 17, 168, 209, 192, 23, 25, 82 } + }, + { + { 125, 98, 42, 88, 104, 85, 117, 175, 82 }, + { 95, 84, 53, 89, 128, 100, 113, 101, 45 }, + { 75, 79, 123, 47, 51, 128, 81, 171, 1 }, + { 57, 17, 5, 71, 102, 57, 53, 41, 49 }, + { 115, 21, 2, 10, 102, 255, 166, 23, 6 }, + { 38, 33, 13, 121, 57, 73, 26, 1, 85 }, + { 41, 10, 67, 138, 77, 110, 90, 47, 114 }, + { 101, 29, 16, 10, 85, 128, 101, 196, 26 }, + { 57, 18, 10, 102, 102, 213, 34, 20, 43 }, + { 117, 20, 15, 36, 163, 128, 68, 1, 26 } + }, + { + { 138, 31, 36, 171, 27, 166, 38, 44, 229 }, + { 67, 87, 58, 169, 82, 115, 26, 59, 179 }, + { 63, 59, 90, 180, 59, 166, 93, 73, 154 }, + { 40, 40, 21, 116, 143, 209, 34, 39, 175 }, + { 57, 46, 22, 24, 128, 1, 54, 17, 37 }, + { 47, 15, 16, 183, 34, 223, 49, 45, 183 }, + { 46, 17, 33, 183, 6, 98, 15, 32, 183 }, + { 65, 32, 73, 115, 28, 128, 23, 128, 205 }, + { 40, 3, 9, 115, 51, 192, 18, 6, 223 }, + { 87, 37, 9, 115, 59, 77, 64, 21, 47 } + }, + { + { 104, 55, 44, 218, 9, 54, 53, 130, 226 }, + { 64, 90, 70, 205, 40, 41, 23, 26, 57 }, + { 54, 57, 112, 184, 5, 41, 38, 166, 213 }, + { 30, 34, 26, 133, 152, 116, 10, 32, 134 }, + { 75, 32, 12, 51, 192, 255, 160, 43, 51 }, + { 39, 19, 53, 221, 26, 114, 32, 73, 255 }, + { 31, 9, 65, 234, 2, 15, 1, 118, 73 }, + { 88, 31, 35, 67, 102, 85, 55, 186, 85 }, + { 56, 21, 23, 111, 59, 205, 45, 37, 192 }, + { 55, 38, 70, 124, 73, 102, 1, 34, 98 } + }, + { + { 102, 61, 71, 37, 34, 53, 31, 243, 192 }, + { 69, 60, 71, 38, 73, 119, 28, 222, 37 }, + { 68, 45, 128, 34, 1, 47, 11, 245, 171 }, + { 62, 17, 19, 70, 146, 85, 55, 62, 70 }, + { 75, 15, 9, 9, 64, 255, 184, 119, 16 }, + { 37, 43, 37, 154, 100, 163, 85, 160, 1 }, + { 63, 9, 92, 136, 28, 64, 32, 201, 85 }, + { 86, 6, 28, 5, 64, 255, 25, 248, 1 }, + { 56, 8, 17, 132, 137, 255, 55, 116, 128 }, + { 58, 15, 20, 82, 135, 57, 26, 121, 40 } + }, + { + { 164, 50, 31, 137, 154, 133, 25, 35, 218 }, + { 51, 103, 44, 131, 131, 123, 31, 6, 158 }, + { 86, 40, 64, 135, 148, 224, 45, 183, 128 }, + { 22, 26, 17, 131, 240, 154, 14, 1, 209 }, + { 83, 12, 13, 54, 192, 255, 68, 47, 28 }, + { 45, 16, 21, 91, 64, 222, 7, 1, 197 }, + { 56, 21, 39, 155, 60, 138, 23, 102, 213 }, + { 85, 26, 85, 85, 128, 128, 32, 146, 171 }, + { 18, 11, 7, 63, 144, 171, 4, 4, 246 }, + { 35, 27, 10, 146, 174, 171, 12, 26, 128 } + }, + { + { 190, 80, 35, 99, 180, 80, 126, 54, 45 }, + { 85, 126, 47, 87, 176, 51, 41, 20, 32 }, + { 101, 75, 128, 139, 118, 146, 116, 128, 85 }, + { 56, 41, 15, 176, 236, 85, 37, 9, 62 }, + { 146, 36, 19, 30, 171, 255, 97, 27, 20 }, + { 71, 30, 17, 119, 118, 255, 17, 18, 138 }, + { 101, 38, 60, 138, 55, 70, 43, 26, 142 }, + { 138, 45, 61, 62, 219, 1, 81, 188, 64 }, + { 32, 41, 20, 117, 151, 142, 20, 21, 163 }, + { 112, 19, 12, 61, 195, 128, 48, 4, 24 } + } +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/media/libvpx/vp8/common/x86/copy_sse2.asm b/media/libvpx/vp8/common/x86/copy_sse2.asm new file mode 100644 index 000000000..86fae2695 --- /dev/null +++ b/media/libvpx/vp8/common/x86/copy_sse2.asm @@ -0,0 +1,93 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_copy32xn_sse2( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse2) PRIVATE +sym(vp8_copy32xn_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;dst_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;dst_stride + movsxd rcx, dword ptr arg(4) ;height + +.block_copy_sse2_loopx4: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + movdqu xmm2, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqu xmm4, XMMWORD PTR [rsi] + movdqu xmm5, XMMWORD PTR [rsi + 16] + movdqu xmm6, XMMWORD PTR [rsi + rax] + movdqu xmm7, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + movdqa XMMWORD PTR [rdi + rdx], xmm2 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 + + lea rdi, [rdi+rdx*2] + + movdqa XMMWORD PTR [rdi], xmm4 + movdqa XMMWORD PTR [rdi + 16], xmm5 + movdqa XMMWORD PTR [rdi + rdx], xmm6 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 + + lea rdi, [rdi+rdx*2] + + sub rcx, 4 + cmp rcx, 4 + jge .block_copy_sse2_loopx4 + + cmp rcx, 0 + je .copy_is_done + +.block_copy_sse2_loop: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + lea rsi, [rsi+rax] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + lea rdi, [rdi+rdx] + + sub rcx, 1 + jne .block_copy_sse2_loop + +.copy_is_done: + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/common/x86/copy_sse3.asm b/media/libvpx/vp8/common/x86/copy_sse3.asm new file mode 100644 index 000000000..d789a40cc --- /dev/null +++ b/media/libvpx/vp8/common/x86/copy_sse3.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define max_sad arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBVPX_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define max_sad [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define max_sad r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define max_sad + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + + +;void vp8_copy32xn_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse3) PRIVATE +sym(vp8_copy32xn_sse3): + + STACK_FRAME_CREATE_X3 + +.block_copy_sse3_loopx4: + lea end_ptr, [src_ptr+src_stride*2] + + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] + movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] + movdqu xmm4, XMMWORD PTR [end_ptr] + movdqu xmm5, XMMWORD PTR [end_ptr + 16] + movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] + movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] + + lea src_ptr, [src_ptr+src_stride*4] + + lea end_ptr, [ref_ptr+ref_stride*2] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 + movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 + movdqa XMMWORD PTR [end_ptr], xmm4 + movdqa XMMWORD PTR [end_ptr + 16], xmm5 + movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 + movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 + + lea ref_ptr, [ref_ptr+ref_stride*4] + + sub height, 4 + cmp height, 4 + jge .block_copy_sse3_loopx4 + + ;Check to see if there is more rows need to be copied. + cmp height, 0 + je .copy_is_done + +.block_copy_sse3_loop: + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + lea src_ptr, [src_ptr+src_stride] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + lea ref_ptr, [ref_ptr+ref_stride] + + sub height, 1 + jne .block_copy_sse3_loop + +.copy_is_done: + STACK_FRAME_DESTROY_X3 diff --git a/media/libvpx/vp8/common/x86/dequantize_mmx.asm b/media/libvpx/vp8/common/x86/dequantize_mmx.asm new file mode 100644 index 000000000..4e551f00a --- /dev/null +++ b/media/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -0,0 +1,258 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) +global sym(vp8_dequantize_b_impl_mmx) PRIVATE +sym(vp8_dequantize_b_impl_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;sq + mov rdi, arg(1) ;dq + mov rax, arg(2) ;q + + movq mm1, [rsi] + pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers. + movq [rdi], mm1 + + movq mm1, [rsi+8] + pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers. + movq [rdi+8], mm1 + + movq mm1, [rsi+16] + pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers. + movq [rdi+16], mm1 + + movq mm1, [rsi+24] + pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers. + movq [rdi+24], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void dequant_idct_add_mmx( +;short *input, 0 +;short *dq, 1 +;unsigned char *dest, 2 +;int stride) 3 +global sym(vp8_dequant_idct_add_mmx) PRIVATE +sym(vp8_dequant_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rdx, arg(1) ;dq + + + movq mm0, [rax ] + pmullw mm0, [rdx] + + movq mm1, [rax +8] + pmullw mm1, [rdx +8] + + movq mm2, [rax+16] + pmullw mm2, [rdx+16] + + movq mm3, [rax+24] + pmullw mm3, [rdx+24] + + mov rdx, arg(2) ;dest + + pxor mm7, mm7 + + + movq [rax], mm7 + movq [rax+8], mm7 + + movq [rax+16],mm7 + movq [rax+24],mm7 + + + movsxd rdi, dword ptr arg(3) ;stride + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rdx] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rdx+rdi] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/media/libvpx/vp8/common/x86/filter_x86.c b/media/libvpx/vp8/common/x86/filter_x86.c new file mode 100644 index 000000000..7f496ed7d --- /dev/null +++ b/media/libvpx/vp8/common/x86/filter_x86.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/x86/filter_x86.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = +{ + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 112, 112, 112, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = +{ + { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } +}; diff --git a/media/libvpx/vp8/common/x86/filter_x86.h b/media/libvpx/vp8/common/x86/filter_x86.h new file mode 100644 index 000000000..d282841be --- /dev/null +++ b/media/libvpx/vp8/common/x86/filter_x86.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_X86_FILTER_X86_H_ +#define VP8_COMMON_X86_FILTER_X86_H_ + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with + * duplicated values */ + +/* duplicated 4x */ +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); + +/* duplicated 8x */ +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/media/libvpx/vp8/common/x86/idct_blk_mmx.c b/media/libvpx/vp8/common/x86/idct_blk_mmx.c new file mode 100644 index 000000000..f2532b34d --- /dev/null +++ b/media/libvpx/vp8/common/x86/idct_blk_mmx.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); + +void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) +{ + short *sq = (short *) d->qcoeff; + short *dq = (short *) d->dqcoeff; + + vp8_dequantize_b_impl_mmx(sq, dq, DQC); +} + +void vp8_dequant_idct_add_y_block_mmx + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dst, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride, + dst+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) + { + vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride, + dst+8, stride); + memset(q + 32, 0, 2 * sizeof(q[0])); + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) + { + vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride, + dst+12, stride); + memset(q + 48, 0, 2 * sizeof(q[0])); + } + + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_mmx + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dstu, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + q += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dstv, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + q += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/media/libvpx/vp8/common/x86/idct_blk_sse2.c b/media/libvpx/vp8/common/x86/idct_blk_sse2.c new file mode 100644 index 000000000..ae96ec858 --- /dev/null +++ b/media/libvpx/vp8/common/x86/idct_blk_sse2.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +void vp8_idct_dequant_0_2x_sse2 + (short *q, short *dq , + unsigned char *dst, int dst_stride); +void vp8_idct_dequant_full_2x_sse2 + (short *q, short *dq , + unsigned char *dst, int dst_stride); + +void vp8_dequant_idct_add_y_block_sse2 + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride); + } + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride); + else + vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride); + } + q += 64; + dst += stride*4; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_sse2 + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } + q += 32; + dstu += stride*4; + + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } + q += 32; + + if (((short *)(eobs))[2]) + { + if (((short *)(eobs))[2] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } + q += 32; + dstv += stride*4; + + if (((short *)(eobs))[3]) + { + if (((short *)(eobs))[3] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } +} diff --git a/media/libvpx/vp8/common/x86/idctllm_mmx.asm b/media/libvpx/vp8/common/x86/idctllm_mmx.asm new file mode 100644 index 000000000..96fa2c60d --- /dev/null +++ b/media/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +; /**************************************************************************** +; * Notes: +; * +; * This implementation makes use of 16 bit fixed point version of two multiply +; * constants: +; * 1. sqrt(2) * cos (pi/8) +; * 2. sqrt(2) * sin (pi/8) +; * Because the first constant is bigger than 1, to maintain the same 16 bit +; * fixed point precision as the second one, we use a trick of +; * x * a = x + x*(a-1) +; * so +; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). +; * +; * For the second constant, because of the 16bit version is 35468, which +; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative +; * number. +; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x +; * +; **************************************************************************/ + + +;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, +;int pitch, unsigned char *dest,int stride) +global sym(vp8_short_idct4x4llm_mmx) PRIVATE +sym(vp8_short_idct4x4llm_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rsi, arg(1) ;pred + + movq mm0, [rax ] + movq mm1, [rax+ 8] + movq mm2, [rax+16] + movq mm3, [rax+24] + +%if 0 + pxor mm7, mm7 + movq [rax], mm7 + movq [rax+8], mm7 + movq [rax+16],mm7 + movq [rax+24],mm7 +%endif + movsxd rax, dword ptr arg(2) ;pitch + mov rdx, arg(3) ;dest + movsxd rdi, dword ptr arg(4) ;stride + + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rsi] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rsi+rax] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + add rsi, rax + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_dc_only_idct_add_mmx( +;short input_dc, +;unsigned char *pred_ptr, +;int pred_stride, +;unsigned char *dst_ptr, +;int stride) +global sym(vp8_dc_only_idct_add_mmx) PRIVATE +sym(vp8_dc_only_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + ; end prolog + + movd mm5, arg(0) ;input_dc + mov rax, arg(1) ;pred_ptr + movsxd rdx, dword ptr arg(2) ;pred_stride + + pxor mm0, mm0 + + paddw mm5, [GLOBAL(fours)] + lea rcx, [rdx + rdx*2] + + psraw mm5, 3 + + punpcklwd mm5, mm5 + + punpckldq mm5, mm5 + + movd mm1, [rax] + movd mm2, [rax+rdx] + movd mm3, [rax+2*rdx] + movd mm4, [rax+rcx] + + mov rax, arg(3) ;d -- destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + punpcklbw mm1, mm0 + paddsw mm1, mm5 + packuswb mm1, mm0 ; pack and unpack to saturate + lea rcx, [rdx + rdx*2] + + punpcklbw mm2, mm0 + paddsw mm2, mm5 + packuswb mm2, mm0 ; pack and unpack to saturate + + punpcklbw mm3, mm0 + paddsw mm3, mm5 + packuswb mm3, mm0 ; pack and unpack to saturate + + punpcklbw mm4, mm0 + paddsw mm4, mm5 + packuswb mm4, mm0 ; pack and unpack to saturate + + movd [rax], mm1 + movd [rax+rdx], mm2 + movd [rax+2*rdx], mm3 + movd [rax+rcx], mm4 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/media/libvpx/vp8/common/x86/idctllm_sse2.asm b/media/libvpx/vp8/common/x86/idctllm_sse2.asm new file mode 100644 index 000000000..bf8e2c402 --- /dev/null +++ b/media/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -0,0 +1,708 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_idct_dequant_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) + +global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE +sym(vp8_idct_dequant_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + ; end prolog + + mov rdx, arg(1) ; dequant + mov rax, arg(0) ; qcoeff + + movd xmm4, [rax] + movd xmm5, [rdx] + + pinsrw xmm4, [rax+32], 4 + pinsrw xmm5, [rdx], 4 + + pmullw xmm4, xmm5 + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; clear coeffs + movd [rax], xmm5 + movd [rax+32], xmm5 +;pshufb + mov rax, arg(2) ; dst + movsxd rdx, dword ptr arg(3) ; dst_stride + + pshuflw xmm4, xmm4, 00000000b + pshufhw xmm4, xmm4, 00000000b + + lea rcx, [rdx + rdx*2] + paddw xmm4, [GLOBAL(fours)] + + psraw xmm4, 3 + + movq xmm0, [rax] + movq xmm1, [rax+rdx] + movq xmm2, [rax+2*rdx] + movq xmm3, [rax+rcx] + + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rax], xmm0 + movq [rax + rdx], xmm1 + + lea rax, [rax + 2*rdx] + + movq [rax], xmm2 + movq [rax + rdx], xmm3 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) +global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE +sym(vp8_idct_dequant_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + mov rdi, arg(2) ; dst + + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + lea rcx, [rdx + rdx*2] ;dst_stride * 3 + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+2*rdx] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_dc_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE +sym(vp8_idct_dequant_dc_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + + mov rdi, arg(2) ; dst + mov rdx, arg(4) ; dc + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; load up 2 dc words here == 2*16 = doubleword + movd xmm4, [rdx] + + movsxd rdx, dword ptr arg(3) ; dst_stride + lea rcx, [rdx + rdx*2] + ; Load up predict blocks + movq xmm0, [rdi] + movq xmm1, [rdi+rdx*1] + movq xmm2, [rdi+rdx*2] + movq xmm3, [rdi+rcx] + + ; Duplicate and expand dc across + punpcklwd xmm4, xmm4 + punpckldq xmm4, xmm4 + + ; Rounding to dequant and downshift + paddw xmm4, [GLOBAL(fours)] + psraw xmm4, 3 + + ; Predict buffer needs to be expanded from bytes to words + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +;void vp8_idct_dequant_dc_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE +sym(vp8_idct_dequant_dc_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + + mov rdi, arg(2) ; dst + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + + ; DC component + mov rdx, arg(4) + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; insert DC component + pinsrw xmm0, [rdx], 0 + pinsrw xmm0, [rdx+2], 4 + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movsxd rdx, dword ptr arg(3) ; dst_stride + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + lea rcx, [rdx + rdx*2] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+rdx*2] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + + ; begin epilog + pop rdi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +fours: + times 8 dw 0x0004 +align 16 +x_s1sqr2: + times 8 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 8 dw 0x4E7B diff --git a/media/libvpx/vp8/common/x86/iwalsh_mmx.asm b/media/libvpx/vp8/common/x86/iwalsh_mmx.asm new file mode 100644 index 000000000..158c3b745 --- /dev/null +++ b/media/libvpx/vp8/common/x86/iwalsh_mmx.asm @@ -0,0 +1,140 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE +sym(vp8_short_inv_walsh4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + ; end prolog + + mov rdx, arg(0) + mov rax, 30003h + + movq mm0, [rdx + 0] ;ip[0] + movq mm1, [rdx + 8] ;ip[4] + movq mm7, rax + + movq mm2, [rdx + 16] ;ip[8] + movq mm3, [rdx + 24] ;ip[12] + punpcklwd mm7, mm7 ;0003000300030003h + mov rdx, arg(1) + + movq mm4, mm0 + movq mm5, mm1 + + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm4 ;temp al + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl + + ; 03 02 01 00 + ; 13 12 11 10 + ; 23 22 21 20 + ; 33 32 31 30 + + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, mm5 ; 33 23 32 22 + + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] +;~~~~~~~~~~~~~~~~~~~~~ + movq mm1, mm0 + movq mm5, mm4 + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + paddw mm1, mm7 + paddw mm6, mm7 + psraw mm1, 3 + psraw mm6, 3 + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl + paddw mm0, mm7 + paddw mm5, mm7 + psraw mm0, 3 + psraw mm5, 3 +;~~~~~~~~~~~~~~~~~~~~~ + + movd eax, mm1 + movd ecx, mm0 + psrlq mm0, 32 + psrlq mm1, 32 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*1], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*5], cx + movd eax, mm1 + movd ecx, mm0 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*9], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*13], cx + + movd eax, mm6 + movd ecx, mm5 + psrlq mm5, 32 + psrlq mm6, 32 + mov word ptr[rdx+32*2], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*6], ax + mov word ptr[rdx+32*7], cx + movd eax, mm6 + movd ecx, mm5 + mov word ptr[rdx+32*10], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*14], ax + mov word ptr[rdx+32*15], cx + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + diff --git a/media/libvpx/vp8/common/x86/iwalsh_sse2.asm b/media/libvpx/vp8/common/x86/iwalsh_sse2.asm new file mode 100644 index 000000000..06e86a80b --- /dev/null +++ b/media/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -0,0 +1,121 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE +sym(vp8_short_inv_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + ; end prolog + + mov rcx, arg(0) + mov rdx, arg(1) + mov rax, 30003h + + movdqa xmm0, [rcx + 0] ;ip[4] ip[0] + movdqa xmm1, [rcx + 16] ;ip[12] ip[8] + + + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] + + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm3 ;d1 a1 + punpckhqdq xmm4, xmm3 ;c1 b1 + + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + movd xmm0, eax + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + paddw xmm5, xmm0 + paddw xmm4, xmm0 + psraw xmm5, 3 + psraw xmm4, 3 + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*2], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*6], cx + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*10], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*14], cx + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*1], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*5], ax + mov word ptr[rdx+32*7], cx + movd eax, xmm5 + movd ecx, xmm4 + mov word ptr[rdx+32*9], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*13], ax + mov word ptr[rdx+32*15], cx + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm new file mode 100644 index 000000000..6d5aaa19d --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -0,0 +1,815 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro LF_ABS 2 + ; %1 value not preserved + ; %2 value preserved + ; output in %1 + movdqa scratch1, %2 ; v2 + + psubusb scratch1, %1 ; v2 - v1 + psubusb %1, %2 ; v1 - v2 + por %1, scratch1 ; abs(v2 - v1) +%endmacro + +%macro LF_FILTER_HEV_MASK 8-9 + + LF_ABS %1, %2 ; abs(p3 - p2) + LF_ABS %2, %3 ; abs(p2 - p1) + pmaxub %1, %2 ; accumulate mask +%if %0 == 8 + movdqa scratch2, %3 ; save p1 + LF_ABS scratch2, %4 ; abs(p1 - p0) +%endif + LF_ABS %4, %5 ; abs(p0 - q0) + LF_ABS %5, %6 ; abs(q0 - q1) +%if %0 == 8 + pmaxub %5, scratch2 ; accumulate hev +%else + pmaxub %5, %9 +%endif + pmaxub %1, %5 ; accumulate mask + + LF_ABS %3, %6 ; abs(p1 - q1) + LF_ABS %6, %7 ; abs(q1 - q2) + pmaxub %1, %6 ; accumulate mask + LF_ABS %7, %8 ; abs(q2 - q3) + pmaxub %1, %7 ; accumulate mask + + paddusb %4, %4 ; 2 * abs(p0 - q0) + pand %3, [GLOBAL(tfe)] + psrlw %3, 1 ; abs(p1 - q1) / 2 + paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + psubusb %1, [limit] + psubusb %4, [blimit] + por %1, %4 + pcmpeqb %1, zero ; mask + + psubusb %5, [thresh] + pcmpeqb %5, zero ; ~hev +%endmacro + +%macro LF_FILTER 6 + ; %1-%4: p1-q1 + ; %5: mask + ; %6: hev + + movdqa scratch2, %6 ; save hev + + pxor %1, [GLOBAL(t80)] ; ps1 + pxor %4, [GLOBAL(t80)] ; qs1 + movdqa scratch1, %1 + psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1) + pandn scratch2, scratch1 ; vp8_filter &= hev + + pxor %2, [GLOBAL(t80)] ; ps0 + pxor %3, [GLOBAL(t80)] ; qs0 + movdqa scratch1, %3 + psubsb scratch1, %2 ; qs0 - ps0 + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + pand %5, scratch2 ; &= mask + + movdqa scratch2, %5 + paddsb %5, [GLOBAL(t4)] ; Filter1 + paddsb scratch2, [GLOBAL(t3)] ; Filter2 + + ; Filter1 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 3 + pand scratch1, [GLOBAL(te0)] + pand %5, [GLOBAL(t1f)] + por %5, scratch1 + + psubsb %3, %5 ; qs0 - Filter1 + pxor %3, [GLOBAL(t80)] + + ; Filter2 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, scratch2 + psrlw scratch2, 3 + pand scratch1, [GLOBAL(te0)] + pand scratch2, [GLOBAL(t1f)] + por scratch2, scratch1 + + paddsb %2, scratch2 ; ps0 + Filter2 + pxor %2, [GLOBAL(t80)] + + ; outer tap adjustments + paddsb %5, [GLOBAL(t1)] + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 1 + pand scratch1, [GLOBAL(t80)] + pand %5, [GLOBAL(t7f)] + por %5, scratch1 + pand %5, %6 ; vp8_filter &= ~hev + + psubsb %4, %5 ; qs1 - vp8_filter + pxor %4, [GLOBAL(t80)] + + paddsb %1, %5 ; ps1 + vp8_filter + pxor %1, [GLOBAL(t80)] +%endmacro + +;void vp8_loop_filter_bh_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) +global sym(vp8_loop_filter_bh_y_sse2) PRIVATE +sym(vp8_loop_filter_bh_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 11 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi ; src_ptr + %define stride rsi ; src_pixel_step + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define i0 [src] + %define i1 [spp] + %define i2 [src + 2 * stride] + %define i3 [spp + 2 * stride] + %define i4 [src + 4 * stride] + %define i5 [spp + 4 * stride] + %define i6 [src + 2 * stride3] + %define i7 [spp + 2 * stride3] + %define i8 [src + 8 * stride] + %define i9 [spp + 8 * stride] + %define i10 [src + 2 * stride5] + %define i11 [spp + 2 * stride5] + %define i12 [src + 4 * stride3] + %define i13 [spp + 4 * stride3] + %define i14 [src + 2 * stride7] + %define i15 [spp + 2 * stride7] + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + pxor zero, zero + + ; load the first set into registers + movdqa xmm0, i0 + movdqa xmm1, i1 + movdqa xmm2, i2 + movdqa xmm3, i3 + movdqa xmm4, i4 + movdqa xmm8, i5 + movdqa xmm9, i6 ; q2, will contain abs(p1-p0) + movdqa xmm10, i7 +LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm3, i4 + movdqa xmm8, i5 +LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm3 + movdqa i5, xmm8 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm4, i8 + movdqa xmm8, i9 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm4 + movdqa i9, xmm8 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm3, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm3, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 + movdqa i12, xmm3 + movdqa i13, xmm8 + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + + +;void vp8_loop_filter_bv_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) + +global sym(vp8_loop_filter_bv_y_sse2) PRIVATE +sym(vp8_loop_filter_bv_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 15 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi + %define stride rsi + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define s0 [src] + %define s1 [spp] + %define s2 [src + 2 * stride] + %define s3 [spp + 2 * stride] + %define s4 [src + 4 * stride] + %define s5 [spp + 4 * stride] + %define s6 [src + 2 * stride3] + %define s7 [spp + 2 * stride3] + %define s8 [src + 8 * stride] + %define s9 [spp + 8 * stride] + %define s10 [src + 2 * stride5] + %define s11 [spp + 2 * stride5] + %define s12 [src + 4 * stride3] + %define s13 [spp + 4 * stride3] + %define s14 [src + 2 * stride7] + %define s15 [spp + 2 * stride7] + + %define i0 [rsp] + %define i1 [rsp + 16] + %define i2 [rsp + 32] + %define i3 [rsp + 48] + %define i4 [rsp + 64] + %define i5 [rsp + 80] + %define i6 [rsp + 96] + %define i7 [rsp + 112] + %define i8 [rsp + 128] + %define i9 [rsp + 144] + %define i10 [rsp + 160] + %define i11 [rsp + 176] + %define i12 [rsp + 192] + %define i13 [rsp + 208] + %define i14 [rsp + 224] + %define i15 [rsp + 240] + + ALIGN_STACK 16, rax + + ; reserve stack space + %define temp_storage 0 ; size is 256 (16*16) + %define stack_size 256 + sub rsp, stack_size + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + + ; 8-f + movdqa xmm0, s8 + movdqa xmm1, xmm0 + punpcklbw xmm0, s9 ; 80 90 + punpckhbw xmm1, s9 ; 88 98 + + movdqa xmm2, s10 + movdqa xmm3, xmm2 + punpcklbw xmm2, s11 ; a0 b0 + punpckhbw xmm3, s11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s12 + movdqa xmm5, xmm3 + punpcklbw xmm3, s13 ; c0 d0 + punpckhbw xmm5, s13 ; c8 d8 + + movdqa xmm6, s14 + movdqa xmm7, xmm6 + punpcklbw xmm6, s15 ; e0 f0 + punpckhbw xmm7, s15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... + movdqa i0, xmm0 + movdqa i1, xmm7 + movdqa i2, xmm4 + movdqa i3, xmm3 + movdqa i4, xmm1 + movdqa i5, xmm8 + movdqa i6, xmm2 + movdqa i7, xmm5 + + ; 0-7 + movdqa xmm0, s0 + movdqa xmm1, xmm0 + punpcklbw xmm0, s1 ; 00 10 + punpckhbw xmm1, s1 ; 08 18 + + movdqa xmm2, s2 + movdqa xmm3, xmm2 + punpcklbw xmm2, s3 ; 20 30 + punpckhbw xmm3, s3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s4 + movdqa xmm5, xmm3 + punpcklbw xmm3, s5 ; 40 50 + punpckhbw xmm5, s5 ; 48 58 + + movdqa xmm6, s6 + movdqa xmm7, xmm6 + punpcklbw xmm6, s7 ; 60 70 + punpckhbw xmm7, s7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i0 + punpckhqdq xmm6, i0 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i1 + punpckhqdq xmm9, i1 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i2 + punpckhqdq xmm10, i2 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i3 + punpckhqdq xmm11, i3 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i4 + punpckhqdq xmm12, i4 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i5 + punpckhqdq xmm13, i5 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i6 + punpckhqdq xmm14, i6 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i7 + punpckhqdq xmm15, i7 + + movdqa i0, xmm0 + movdqa i1, xmm6 + movdqa i2, xmm7 + movdqa i3, xmm9 + movdqa i4, xmm4 + movdqa i5, xmm10 + movdqa i6, xmm3 + movdqa i7, xmm11 + movdqa i8, xmm1 + movdqa i9, xmm12 + movdqa i10, xmm8 + movdqa i11, xmm13 + movdqa i12, xmm2 + movdqa i13, xmm14 + movdqa i14, xmm5 + movdqa i15, xmm15 + +; TRANSPOSED DATA AVAILABLE ON THE STACK + + movdqa xmm12, xmm6 + movdqa xmm13, xmm7 + + pxor zero, zero + +LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm8, i4 + movdqa xmm9, i5 +LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm8 + movdqa i5, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm3, i8 + movdqa xmm4, i9 +LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm3 + movdqa i9, xmm4 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm8, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm4, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 + movdqa i12, xmm4 + movdqa i13, xmm8 + + +; RESHUFFLE AND WRITE OUT + ; 8-f + movdqa xmm0, i8 + movdqa xmm1, xmm0 + punpcklbw xmm0, i9 ; 80 90 + punpckhbw xmm1, i9 ; 88 98 + + movdqa xmm2, i10 + movdqa xmm3, xmm2 + punpcklbw xmm2, i11 ; a0 b0 + punpckhbw xmm3, i11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i12 + movdqa xmm5, xmm3 + punpcklbw xmm3, i13 ; c0 d0 + punpckhbw xmm5, i13 ; c8 d8 + + movdqa xmm6, i14 + movdqa xmm7, xmm6 + punpcklbw xmm6, i15 ; e0 f0 + punpckhbw xmm7, i15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... + movdqa i8, xmm0 + movdqa i9, xmm7 + movdqa i10, xmm4 + movdqa i11, xmm3 + movdqa i12, xmm1 + movdqa i13, xmm8 + movdqa i14, xmm2 + movdqa i15, xmm5 + + ; 0-7 + movdqa xmm0, i0 + movdqa xmm1, xmm0 + punpcklbw xmm0, i1 ; 00 10 + punpckhbw xmm1, i1 ; 08 18 + + movdqa xmm2, i2 + movdqa xmm3, xmm2 + punpcklbw xmm2, i3 ; 20 30 + punpckhbw xmm3, i3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i4 + movdqa xmm5, xmm3 + punpcklbw xmm3, i5 ; 40 50 + punpckhbw xmm5, i5 ; 48 58 + + movdqa xmm6, i6 + movdqa xmm7, xmm6 + punpcklbw xmm6, i7 ; 60 70 + punpckhbw xmm7, i7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i8 + punpckhqdq xmm6, i8 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i9 + punpckhqdq xmm9, i9 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i10 + punpckhqdq xmm10, i10 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i11 + punpckhqdq xmm11, i11 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i12 + punpckhqdq xmm12, i12 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i13 + punpckhqdq xmm13, i13 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i14 + punpckhqdq xmm14, i14 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i15 + punpckhqdq xmm15, i15 + + movdqa s0, xmm0 + movdqa s1, xmm6 + movdqa s2, xmm7 + movdqa s3, xmm9 + movdqa s4, xmm4 + movdqa s5, xmm10 + movdqa s6, xmm3 + movdqa s7, xmm11 + movdqa s8, xmm1 + movdqa s9, xmm12 + movdqa s10, xmm8 + movdqa s11, xmm13 + movdqa s12, xmm2 + movdqa s13, xmm14 + movdqa s14, xmm5 + movdqa s15, xmm15 + + ; free stack space + add rsp, stack_size + + ; un-ALIGN_STACK + pop rsp + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + +SECTION_RODATA +align 16 +te0: + times 16 db 0xe0 +align 16 +t7f: + times 16 db 0x7f +align 16 +tfe: + times 16 db 0xfe +align 16 +t1f: + times 16 db 0x1f +align 16 +t80: + times 16 db 0x80 +align 16 +t1: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 diff --git a/media/libvpx/vp8/common/x86/loopfilter_mmx.asm b/media/libvpx/vp8/common/x86/loopfilter_mmx.asm new file mode 100644 index 000000000..88a07b9f3 --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_mmx.asm @@ -0,0 +1,1753 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_loop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE +sym(vp8_loop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_h: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 ; + + + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + + psubusb mm4, mm7 + por mm1, mm4 + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, [rsi] ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + pxor mm0, mm0 ; + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + psraw mm5, 11 + packsswb mm0, mm5 + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + movq mm6, [rsi+2*rax] ; p1 + pxor mm6, [GLOBAL(t80)] ; reoffset + paddsb mm6, mm4 ; p1+= p1 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+2*rax], mm6 ; write back + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + movq [rdi], mm7 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .next8_h + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE +sym(vp8_loop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 64 ; reserve 64 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +.next8_v: + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + + ;transpose + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + + punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 + punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + psubusb mm5, mm7 ; q2-q3 + + psubusb mm7, mm6 ; q3-q2 + por mm7, mm5; ; mm7=abs (q3-q2) + + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + + psubusb mm3, mm6 ; q1-q2 + psubusb mm6, mm5 ; q2-q1 + + por mm6, mm3 ; mm6=abs(q2-q1) + lea rdx, srct + + movq [rdx+24], mm5 ; save q1 + movq [rdx+16], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+8], mm3 ; save p0 + + movq [rdx], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 + + psubusb mm0, mm4 + psubusb mm1, mm4 + + psubusb mm6, mm4 + por mm7, mm6 + + por mm0, mm1 + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+16] ; mm5=q0 + movq mm7, [rdx+24] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save abs(q1-q0) + psubusb mm7, mm4 + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; + + movq mm4, [rdx] ;blimit + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + ; start work on filters + lea rdx, srct + + movq mm2, [rdx] ; p1 + movq mm7, [rdx+24] ; q1 + + movq mm6, [rdx+8] ; p0 + movq mm0, [rdx+16] ; q0 + + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + pxor mm0, mm0 ; + + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + + psraw mm5, 11 + packsswb mm0, mm5 + + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + ; mm6=p0 ; + movq mm1, [rdx] ; p1 + pxor mm1, [GLOBAL(t80)] ; reoffset + + paddsb mm1, mm4 ; p1+= p1 add + pxor mm1, [GLOBAL(t80)] ; unoffset + ; mm6 = p0 mm1 = p1 + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; mm3 = q0 + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + ; mm7 = q1 + + ; transpose and write back + ; mm1 = 72 62 52 42 32 22 12 02 + ; mm6 = 73 63 53 43 33 23 13 03 + ; mm3 = 74 64 54 44 34 24 14 04 + ; mm7 = 75 65 55 45 35 25 15 05 + + movq mm2, mm1 ; 72 62 52 42 32 22 12 02 + punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 + + movq mm4, mm3 ; 74 64 54 44 34 24 14 04 + punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 + + punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 + punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 + + movq mm6, mm2 ; 33 32 23 22 13 12 03 02 + punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 + + punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 + movq mm5, mm1 ; 73 72 63 62 53 52 43 42 + + punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 + punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 + + + ; mm2 = 15 14 13 12 05 04 03 02 + ; mm6 = 35 34 33 32 25 24 23 22 + ; mm5 = 55 54 53 52 45 44 43 42 + ; mm1 = 75 74 73 72 65 64 63 62 + + + + movd [rsi+rax*4+2], mm2 + psrlq mm2, 32 + + movd [rdi+rax*4+2], mm2 + movd [rsi+rax*2+2], mm6 + + psrlq mm6, 32 + movd [rsi+rax+2],mm6 + + movd [rsi+2], mm1 + psrlq mm1, 32 + + movd [rdi+2], mm1 + neg rax + + movd [rdi+rax+2],mm5 + psrlq mm5, 32 + + movd [rdi+rax*2+2], mm5 + + lea rsi, [rsi+rax*8] + dec rcx + jnz .next8_v + + add rsp, 64 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_mbh: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 + + + ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm3=q1, mm7 = limit + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm5 = p0 + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, mm0 ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + + ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, mm4=hev + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + + ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 + movq mm2, mm1 ; vp8_filter + pand mm2, mm4; ; Filter2 = vp8_filter & hev + + movq mm5, mm2 ; + paddsb mm5, [GLOBAL(t3)]; + + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm5 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm5 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + movq mm5, mm0 ; Filter2 + + paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm2 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm2 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 + psubsb mm3, mm0 ; qs0 =qs0 - filter1 + paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 + + ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn mm4, mm1 ; vp8_filter&=~hev + + + ; mm3=qs0, mm4=filter2, mm6=ps0 + + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor mm0, mm0 + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s27)] + pmulhw mm2, [GLOBAL(s27)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + psubsb mm3, mm1 + paddsb mm6, mm1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + movq [rsi+rax], mm6 + movq [rsi], mm3 + + ; roughly 2/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s18)] + pmulhw mm2, [GLOBAL(s18)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm3, [rdi] + movq mm6, [rsi+rax*2] ; p1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdi], mm3 + movq [rsi+rax*2], mm6 + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s9)] + pmulhw mm2, [GLOBAL(s9)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + + movq mm6, [rdi+rax*4] + neg rax + movq mm3, [rdi+rax ] + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdi+rax ], mm3 + neg rax + movq [rdi+rax*4], mm6 + +;EARLY_BREAK_OUT: + neg rax + add rsi,8 + dec rcx + jnz .next8_mbh + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +.next8_mbv: + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ;transpose + movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 + + punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 + movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + + movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 + + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + lea rdx, srct + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + + movq [rdx+56], mm7 + psubusb mm5, mm7 ; q2-q3 + + + movq [rdx+48], mm6 + psubusb mm7, mm6 ; q3-q2 + + por mm7, mm5; ; mm7=abs (q3-q2) + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + psubusb mm3, mm6 ; q1-q2 + + psubusb mm6, mm5 ; q2-q1 + por mm6, mm3 ; mm6=abs(q2-q1) + + movq [rdx+40], mm5 ; save q1 + movq [rdx+32], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq [rdx], mm0 ; save p3 + movq [rdx+8], mm1 ; save p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+24], mm3 ; save p0 + + movq [rdx+16], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 ; abs(q3-q2) > limit + + psubusb mm0, mm4 ; abs(p3-p2) > limit + psubusb mm1, mm4 ; abs(p2-p1) > limit + + psubusb mm6, mm4 ; abs(q2-q1) > limit + por mm7, mm6 ; or + + por mm0, mm1 ; + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+32] ; mm5=q0 + movq mm7, [rdx+40] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save abs(q1-q0) + psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; + + movq mm4, [rdx] ;blimit + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 ; abs(q1 - q0) > thresh + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 ; abs(p1 - p0)> thresh + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + + ; start work on filters + lea rdx, srct + + ; start work on filters + movq mm2, [rdx+16] ; p1 + movq mm7, [rdx+40] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + movq mm6, [rdx+24] ; p0 + movq mm0, [rdx+32] ; q0 + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 + movq mm2, mm1 ; vp8_filter + pand mm2, mm4; ; Filter2 = vp8_filter & hev + + movq mm5, mm2 ; + paddsb mm5, [GLOBAL(t3)]; + + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm5 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm5 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + movq mm5, mm0 ; Filter2 + + paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm2 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm2 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 + psubsb mm3, mm0 ; qs0 =qs0 - filter1 + paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 + + ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn mm4, mm1 ; vp8_filter&=~hev + + + ; mm3=qs0, mm4=filter2, mm6=ps0 + + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor mm0, mm0 + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s27)] + pmulhw mm2, [GLOBAL(s27)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + psubsb mm3, mm1 + paddsb mm6, mm1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + movq [rdx+24], mm6 + movq [rdx+32], mm3 + + ; roughly 2/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s18)] + pmulhw mm2, [GLOBAL(s18)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm3, [rdx + 40] + movq mm6, [rdx + 16] ; p1 + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdx + 40], mm3 + movq [rdx + 16], mm6 + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s9)] + pmulhw mm2, [GLOBAL(s9)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm6, [rdx+ 8] + movq mm3, [rdx+48] + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 + pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 + + ; transpose and write back + movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 + movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 + + punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 + punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 + + movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 + movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 + + punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 + punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 + + movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 + punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 + + punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 + movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 + + punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 + punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 + + movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 + punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 + + movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 + punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 + + movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 + punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 + + punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 + movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 + + punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 + punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 + + movq [rsi+rax*4], mm0 ; write out + movq [rdi+rax*4], mm6 ; write out + + movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 + punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 + + punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 + movq [rsi+rax*2], mm0 ; write out + + movq [rdi+rax*2], mm5 ; write out + movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 + + punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 + punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 + + movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 + punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 + + punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 + movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 + + movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 + punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 + + punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 + movq [rsi], mm0 ; write out + + movq [rdi], mm1 ; write out + neg rax + + punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 + punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 + + movq [rsi+rax*2], mm3 + movq [rdi+rax*2], mm4 + + lea rsi, [rsi+rax*8] + dec rcx + + jnz .next8_mbv + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit +;) +global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE +sym(vp8_loop_filter_simple_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + mov rcx, 2 ; count +.nexts8_h: + mov rdx, arg(2) ;blimit ; get blimit + movq mm3, [rdx] ; + + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + neg rax + + ; calculate mask + movq mm1, [rsi+2*rax] ; p1 + movq mm0, [rdi] ; q1 + movq mm2, mm1 + movq mm7, mm0 + movq mm4, mm0 + psubusb mm0, mm1 ; q1-=p1 + psubusb mm1, mm4 ; p1-=q1 + por mm1, mm0 ; abs(p1-q1) + pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm1, 1 ; abs(p1-q1)/2 + + movq mm5, [rsi+rax] ; p0 + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + movq mm6, mm5 ; p0 + psubusb mm5, mm4 ; p0-=q0 + psubusb mm4, mm6 ; q0-=p0 + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor mm3, mm3 + pcmpeqb mm5, mm3 + + ; start work on filters + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) + pand mm5, mm2 ; mask filter values we don't care about + + ; do + 4 side + paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + movq mm1, mm5 ; get a copy of filters + psraw mm1, 11 ; arithmetic shift right 11 + psllw mm1, 8 ; shift left 8 to put it back + + por mm0, mm1 ; put the two together to get result + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + + ; now do +3 side + psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + + paddsb mm6, mm0 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .nexts8_h + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit +;) +global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE +sym(vp8_loop_filter_simple_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4- 2]; ; + mov rcx, 2 ; count +.nexts8_v: + + lea rdi, [rsi + rax]; + movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 + + movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 + punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 + + movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 + movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 + + punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 + movq mm5, mm4 ; 53 43 52 42 51 41 50 40 + + punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 + punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 + + neg rax + + movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 + movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 + + punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 + movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 + + movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 + punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 + + movq mm2, mm0 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 13 03 12 02 11 01 10 00 + + punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 + movq mm3, mm2 ; 33 23 13 03 32 22 12 02 + + punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 + punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 + + punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 + + + ; calculate mask + movq mm6, mm0 ; p1 + movq mm7, mm3 ; q1 + psubusb mm7, mm6 ; q1-=p1 + psubusb mm6, mm3 ; p1-=q1 + por mm6, mm7 ; abs(p1-q1) + pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm6, 1 ; abs(p1-q1)/2 + + movq mm5, mm1 ; p0 + movq mm4, mm2 ; q0 + + psubusb mm5, mm2 ; p0-=q0 + psubusb mm4, mm1 ; q0-=p0 + + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] + + psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor mm7, mm7 + pcmpeqb mm5, mm7 ; mm5 = mask + + ; start work on filters + movq t0, mm0 + movq t1, mm3 + + pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm0, mm3 ; p1 - q1 + movq mm6, mm1 ; p0 + + movq mm7, mm2 ; q0 + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + + pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm7 ; offseted ; q0 + + psubsb mm7, mm6 ; q0 - p0 + paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) + + paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) + + pand mm5, mm0 ; mask filter values we don't care about + + paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + + movq mm7, mm5 ; get a copy of filters + psraw mm7, 11 ; arithmetic shift right 11 + psllw mm7, 8 ; shift left 8 to put it back + + por mm0, mm7 ; put the two together to get result + + psubsb mm3, mm0 ; q0-= q0sz add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; now do +3 side + psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + paddsb mm6, mm0 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + + movq mm0, t0 + movq mm4, t1 + + ; mm0 = 70 60 50 40 30 20 10 00 + ; mm6 = 71 61 51 41 31 21 11 01 + ; mm3 = 72 62 52 42 32 22 12 02 + ; mm4 = 73 63 53 43 33 23 13 03 + ; transpose back to write out + + movq mm1, mm0 ; + punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 + + punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 + movq mm2, mm3 ; + + punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 + movq mm5, mm1 ; 71 70 61 60 51 50 41 40 + + punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 + movq mm6, mm0 ; 31 30 21 20 11 10 01 00 + + punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 + punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 + + movd [rsi+rax*4], mm0 ; write 03 02 01 00 + punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 + + psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 + punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 + + movd [rdi+rax*4], mm0 ; write 13 12 11 10 + movd [rsi+rax*2], mm6 ; write 23 22 21 20 + + psrlq mm6, 32 ; 33 32 31 30 + movd [rsi], mm1 ; write 43 42 41 40 + + movd [rsi + rax], mm6 ; write 33 32 31 30 + neg rax + + movd [rsi + rax*2], mm5 ; write 63 62 61 60 + psrlq mm1, 32 ; 53 52 51 50 + + movd [rdi], mm1 ; write out 53 52 51 50 + psrlq mm5, 32 ; 73 72 71 70 + + movd [rdi + rax*2], mm5 ; write 73 72 71 70 + + lea rsi, [rsi+rax*8] ; next 8 + + dec rcx + jnz .nexts8_v + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, +; int y_stride, +; loop_filter_info *lfi) +;{ +; +; +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +;} + +SECTION_RODATA +align 16 +tfe: + times 8 db 0xfe +align 16 +t80: + times 8 db 0x80 +align 16 +t1s: + times 8 db 0x01 +align 16 +t3: + times 8 db 0x03 +align 16 +t4: + times 8 db 0x04 +align 16 +ones: + times 4 dw 0x0001 +align 16 +s27: + times 4 dw 0x1b00 +align 16 +s18: + times 4 dw 0x1200 +align 16 +s9: + times 4 dw 0x0900 +align 16 +s63: + times 4 dw 0x003f diff --git a/media/libvpx/vp8/common/x86/loopfilter_sse2.asm b/media/libvpx/vp8/common/x86/loopfilter_sse2.asm new file mode 100644 index 000000000..1913abc69 --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -0,0 +1,1640 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +%define _t0 0 +%define _t1 _t0 + 16 +%define _p3 _t1 + 16 +%define _p2 _p3 + 16 +%define _p1 _p2 + 16 +%define _p0 _p1 + 16 +%define _q0 _p0 + 16 +%define _q1 _q0 + 16 +%define _q2 _q1 + 16 +%define _q3 _q2 + 16 +%define lf_var_size 160 + +; Use of pmaxub instead of psubusb to compute filter mask was seen +; in ffvp8 + +%macro LFH_FILTER_AND_HEV_MASK 1 +%if %1 + movdqa xmm2, [rdi+2*rax] ; q3 + movdqa xmm1, [rsi+2*rax] ; q2 + movdqa xmm4, [rsi+rax] ; q1 + movdqa xmm5, [rsi] ; q0 + neg rax ; negate pitch to deal with above border +%else + movlps xmm2, [rsi + rcx*2] ; q3 + movlps xmm1, [rsi + rcx] ; q2 + movlps xmm4, [rsi] ; q1 + movlps xmm5, [rsi + rax] ; q0 + + movhps xmm2, [rdi + rcx*2] + movhps xmm1, [rdi + rcx] + movhps xmm4, [rdi] + movhps xmm5, [rdi + rax] + + lea rsi, [rsi + rax*4] + lea rdi, [rdi + rax*4] + + movdqa [rsp+_q2], xmm1 ; store q2 + movdqa [rsp+_q1], xmm4 ; store q1 +%endif + movdqa xmm7, [rdx] ;limit + + movdqa xmm6, xmm1 ; q2 + movdqa xmm3, xmm4 ; q1 + + psubusb xmm1, xmm2 ; q2-=q3 + psubusb xmm2, xmm6 ; q3-=q2 + + psubusb xmm4, xmm6 ; q1-=q2 + psubusb xmm6, xmm3 ; q2-=q1 + + por xmm4, xmm6 ; abs(q2-q1) + por xmm1, xmm2 ; abs(q3-q2) + + movdqa xmm0, xmm5 ; q0 + pmaxub xmm1, xmm4 + + psubusb xmm5, xmm3 ; q0-=q1 + psubusb xmm3, xmm0 ; q1-=q0 + + por xmm5, xmm3 ; abs(q0-q1) + movdqa [rsp+_t0], xmm5 ; save to t0 + + pmaxub xmm1, xmm5 + +%if %1 + movdqa xmm2, [rsi+4*rax] ; p3 + movdqa xmm4, [rdi+4*rax] ; p2 + movdqa xmm6, [rsi+2*rax] ; p1 +%else + movlps xmm2, [rsi + rax] ; p3 + movlps xmm4, [rsi] ; p2 + movlps xmm6, [rsi + rcx] ; p1 + + movhps xmm2, [rdi + rax] + movhps xmm4, [rdi] + movhps xmm6, [rdi + rcx] + + movdqa [rsp+_p2], xmm4 ; store p2 + movdqa [rsp+_p1], xmm6 ; store p1 +%endif + + movdqa xmm5, xmm4 ; p2 + movdqa xmm3, xmm6 ; p1 + + psubusb xmm4, xmm2 ; p2-=p3 + psubusb xmm2, xmm5 ; p3-=p2 + + psubusb xmm3, xmm5 ; p1-=p2 + pmaxub xmm1, xmm4 ; abs(p3 - p2) + + psubusb xmm5, xmm6 ; p2-=p1 + pmaxub xmm1, xmm2 ; abs(p3 - p2) + + pmaxub xmm1, xmm5 ; abs(p2 - p1) + movdqa xmm2, xmm6 ; p1 + + pmaxub xmm1, xmm3 ; abs(p2 - p1) +%if %1 + movdqa xmm4, [rsi+rax] ; p0 + movdqa xmm3, [rdi] ; q1 +%else + movlps xmm4, [rsi + rcx*2] ; p0 + movhps xmm4, [rdi + rcx*2] + movdqa xmm3, [rsp+_q1] ; q1 +%endif + + movdqa xmm5, xmm4 ; p0 + psubusb xmm4, xmm6 ; p0-=p1 + + psubusb xmm6, xmm5 ; p1-=p0 + + por xmm6, xmm4 ; abs(p1 - p0) + mov rdx, arg(2) ; get blimit + + movdqa [rsp+_t1], xmm6 ; save to t1 + + movdqa xmm4, xmm3 ; q1 + pmaxub xmm1, xmm6 + + psubusb xmm3, xmm2 ; q1-=p1 + psubusb xmm2, xmm4 ; p1-=q1 + + psubusb xmm1, xmm7 + por xmm2, xmm3 ; abs(p1-q1) + + movdqa xmm7, [rdx] ; blimit + mov rdx, arg(4) ; hev get thresh + + movdqa xmm3, xmm0 ; q0 + pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + + movdqa xmm6, xmm5 ; p0 + psrlw xmm2, 1 ; abs(p1-q1)/2 + + psubusb xmm5, xmm3 ; p0-=q0 + psubusb xmm3, xmm6 ; q0-=p0 + por xmm5, xmm3 ; abs(p0 - q0) + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + + movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0) + movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0) + + paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm2, [rdx] ; hev + + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + psubusb xmm4, xmm2 ; hev + + psubusb xmm3, xmm2 ; hev + por xmm1, xmm5 + + pxor xmm7, xmm7 + paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb xmm4, xmm5 ; hev + pcmpeqb xmm3, xmm3 ; hev + + pcmpeqb xmm1, xmm7 ; mask xmm1 + pxor xmm4, xmm3 ; hev +%endmacro + +%macro B_FILTER 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + pxor xmm6, xmm3 ; offset to convert to signed values + + pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) + pxor xmm0, xmm3 ; offset to convert to signed values + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 + paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + punpckhbw xmm5, xmm2 ; axbxcxdx + punpcklbw xmm2, xmm2 ; exfxgxhx + + punpcklbw xmm0, xmm1 ; exfxgxhx + psraw xmm5, 11 ; sign extended shift right by 3 + + punpckhbw xmm1, xmm1 ; axbxcxdx + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + psraw xmm0, 11 ; sign extended shift right by 3 + + psraw xmm1, 11 ; sign extended shift right by 3 + movdqa xmm5, xmm0 ; save results + + packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + + paddsb xmm6, xmm2 ; p0+= p0 add + + movdqa xmm2, [GLOBAL(ones)] + paddsw xmm5, xmm2 + paddsw xmm1, xmm2 + psraw xmm5, 1 ; partial shifted one more time for 2nd tap + psraw xmm1, 1 ; partial shifted one more time for 2nd tap + packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + movdqa xmm2, [GLOBAL(t80)] + +%if %1 == 0 + movdqa xmm1, [rsp+_p1] ; p1 + lea rsi, [rsi + rcx*2] + lea rdi, [rdi + rcx*2] +%elif %1 == 1 + movdqa xmm1, [rsi+2*rax] ; p1 +%elif %1 == 2 + movdqa xmm1, [rsp+_p1] ; p1 +%endif + + pandn xmm4, xmm5 ; high edge variance additive + pxor xmm6, xmm2 ; unoffset + + pxor xmm1, xmm2 ; reoffset + psubsb xmm3, xmm0 ; q0-= q0 add + + paddsb xmm1, xmm4 ; p1+= p1 add + pxor xmm3, xmm2 ; unoffset + + pxor xmm1, xmm2 ; unoffset + psubsb xmm7, xmm4 ; q1-= q1 add + + pxor xmm7, xmm2 ; unoffset +%if %1 == 0 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rax], xmm1 ; p1 + movhps [rdi + rax], xmm1 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + movq [rsi + rcx*2], xmm7 ; q1 + movhps [rdi + rcx*2], xmm7 +%elif %1 == 1 + movdqa [rsi+rax], xmm6 ; write back + movdqa [rsi+2*rax], xmm1 ; write back + movdqa [rsi], xmm3 ; write back + movdqa [rdi], xmm7 ; write back +%endif + +%endmacro + +%if ABI_IS_32BIT + +;void vp8_loop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE +sym(vp8_loop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the result + B_FILTER 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE +sym(vp8_loop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the result + B_FILTER 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro MB_FILTER_AND_WRITEBACK 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 + + mov rcx, rax + neg rcx +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + pxor xmm6, xmm3 ; offset to convert to signed values + pxor xmm0, xmm3 ; offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 ; vp8_filter + + pand xmm2, xmm4 ; Filter2 = vp8_filter & hev + pxor xmm0, xmm0 + + pandn xmm4, xmm1 ; vp8_filter&=~hev + pxor xmm1, xmm1 + + punpcklbw xmm0, xmm4 ; Filter 2 (hi) + punpckhbw xmm1, xmm4 ; Filter 2 (lo) + + movdqa xmm5, xmm2 + + movdqa xmm4, [GLOBAL(s9)] + paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) + paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + + pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9 + pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9 + + punpckhbw xmm7, xmm5 ; axbxcxdx + punpcklbw xmm5, xmm5 ; exfxgxhx + + psraw xmm7, 11 ; sign extended shift right by 3 + + psraw xmm5, 11 ; sign extended shift right by 3 + punpckhbw xmm4, xmm2 ; axbxcxdx + + punpcklbw xmm2, xmm2 ; exfxgxhx + psraw xmm4, 11 ; sign extended shift right by 3 + + packsswb xmm5, xmm7 ; Filter2 >>=3; + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm4 ; Filter1 >>=3; + + paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 + + psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 + movdqa xmm7, xmm1 + + movdqa xmm4, [GLOBAL(s63)] + movdqa xmm5, xmm0 + movdqa xmm2, xmm5 + paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63 + paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63 + movdqa xmm4, xmm7 + + paddw xmm5, xmm5 ; Filter 2 (hi) * 18 + + paddw xmm7, xmm7 ; Filter 2 (lo) * 18 + paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 + + paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 + paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 + psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 + + paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 + psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 + psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 + + packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 + psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 + psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 + + packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + movdqa xmm7, [GLOBAL(t80)] + +%if %1 == 0 + movdqa xmm1, [rsp+_q1] ; q1 + movdqa xmm4, [rsp+_p1] ; p1 + lea rsi, [rsi+rcx*2] + lea rdi, [rdi+rcx*2] + +%elif %1 == 1 + movdqa xmm1, [rdi] ; q1 + movdqa xmm4, [rsi+rax*2] ; p1 +%elif %1 == 2 + movdqa xmm4, [rsp+_p1] ; p1 + movdqa xmm1, [rsp+_q1] ; q1 +%endif + + pxor xmm1, xmm7 + pxor xmm4, xmm7 + + psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) + paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) + psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) + paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) + +%if %1 == 1 + movdqa xmm2, [rdi+rax*4] ; p2 + movdqa xmm5, [rdi+rcx] ; q2 +%else + movdqa xmm2, [rsp+_p2] ; p2 + movdqa xmm5, [rsp+_q2] ; q2 +%endif + + pxor xmm1, xmm7 ; *oq1 = sq^0x80; + pxor xmm4, xmm7 ; *op1 = sp^0x80; + pxor xmm2, xmm7 + pxor xmm5, xmm7 + paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) + psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) + pxor xmm2, xmm7 ; *op2 = sp^0x80; + pxor xmm5, xmm7 ; *oq2 = sq^0x80; + pxor xmm3, xmm7 ; *oq0 = sq^0x80 + pxor xmm6, xmm7 ; *oq0 = sp^0x80 +%if %1 == 0 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + lea rdx, [rcx + rcx*2] + movq [rsi+rcx*2], xmm1 ; q1 + movhps [rdi+rcx*2], xmm1 + + movq [rsi + rax], xmm4 ; p1 + movhps [rdi + rax], xmm4 + + movq [rsi+rax*2], xmm2 ; p2 + movhps [rdi+rax*2], xmm2 + + movq [rsi+rdx], xmm5 ; q2 + movhps [rdi+rdx], xmm5 +%elif %1 == 1 + movdqa [rdi+rcx], xmm5 ; q2 + movdqa [rdi], xmm1 ; q1 + movdqa [rsi], xmm3 ; q0 + movdqa [rsi+rax ], xmm6 ; p0 + movdqa [rsi+rax*2], xmm4 ; p1 + movdqa [rdi+rax*4], xmm2 ; p2 +%elif %1 == 2 + movdqa [rsp+_p1], xmm4 ; p1 + movdqa [rsp+_p0], xmm6 ; p0 + movdqa [rsp+_q0], xmm3 ; q0 + movdqa [rsp+_q1], xmm1 ; q1 +%endif + +%endmacro + + +;void vp8_mbloop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro TRANSPOSE_16X8 2 + movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 + movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 + + punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + + movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + + movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 + + movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 + + punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 +%if %1 + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] +%else + mov rsi, arg(5) ; v_ptr +%endif + + movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 + punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 + punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 + punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + +%if %1 == 0 + lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing + lea rsi, [rsi - 4] +%endif + + movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + + movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + + punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + + movdqa [rsp+_t0], xmm2 ; save to free XMM2 + + movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + + punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + + movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + + punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 + + movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 + + movdqa xmm6, xmm1 ; + punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 + + punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + movdqa xmm0, xmm5 + punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + + punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 + + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + + punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 + +%if %2 == 0 + movdqa [rsp+_q3], xmm7 ; save 7 + movdqa [rsp+_q2], xmm6 ; save 6 +%endif + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa [rsp+_p1], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + movdqa [rsp+_p0], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rsp+_q0], xmm4 ; save 4 + movdqa [rsp+_q1], xmm5 ; save 5 + movdqa xmm1, [rsp+_t0] + + movdqa xmm2, xmm1 ; + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + +%if %2 == 0 + movdqa [rsp+_p2], xmm1 + movdqa [rsp+_p3], xmm2 +%endif + +%endmacro + +%macro LFV_FILTER_MASK_HEV_MASK 0 + movdqa xmm0, xmm6 ; q2 + psubusb xmm0, xmm7 ; q2-q3 + + psubusb xmm7, xmm6 ; q3-q2 + movdqa xmm4, xmm5 ; q1 + + por xmm7, xmm0 ; abs (q3-q2) + psubusb xmm4, xmm6 ; q1-q2 + + movdqa xmm0, xmm1 + psubusb xmm6, xmm5 ; q2-q1 + + por xmm6, xmm4 ; abs (q2-q1) + psubusb xmm0, xmm2 ; p2 - p3; + + psubusb xmm2, xmm1 ; p3 - p2; + por xmm0, xmm2 ; abs(p2-p3) + + movdqa xmm5, [rsp+_p1] ; p1 + pmaxub xmm0, xmm7 + + movdqa xmm2, xmm5 ; p1 + psubusb xmm5, xmm1 ; p1-p2 + psubusb xmm1, xmm2 ; p2-p1 + + movdqa xmm7, xmm3 ; p0 + psubusb xmm7, xmm2 ; p0-p1 + + por xmm1, xmm5 ; abs(p2-p1) + pmaxub xmm0, xmm6 + + pmaxub xmm0, xmm1 + movdqa xmm1, xmm2 ; p1 + + psubusb xmm2, xmm3 ; p1-p0 + + por xmm2, xmm7 ; abs(p1-p0) + + pmaxub xmm0, xmm2 + + movdqa xmm5, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 + + mov rdx, arg(3) ; limit + + movdqa xmm6, xmm5 ; q0 + movdqa xmm4, xmm7 ; q1 + + psubusb xmm5, xmm7 ; q0-q1 + psubusb xmm7, xmm6 ; q1-q0 + + por xmm7, xmm5 ; abs(q1-q0) + + pmaxub xmm0, xmm7 + + psubusb xmm0, [rdx] ; limit + + mov rdx, arg(2) ; blimit + movdqa xmm5, xmm4 ; q1 + + psubusb xmm5, xmm1 ; q1-=p1 + psubusb xmm1, xmm4 ; p1-=q1 + + por xmm5, xmm1 ; abs(p1-q1) + movdqa xmm1, xmm3 ; p0 + + pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psubusb xmm1, xmm6 ; p0-q0 + + movdqa xmm4, [rdx] ; blimit + mov rdx, arg(4) ; get thresh + + psrlw xmm5, 1 ; abs(p1-q1)/2 + psubusb xmm6, xmm3 ; q0-p0 + + por xmm1, xmm6 ; abs(q0-p0) + paddusb xmm1, xmm1 ; abs(q0-p0)*2 + movdqa xmm3, [rdx] + + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh + + psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh + + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + por xmm1, xmm0 ; mask + pcmpeqb xmm2, xmm0 + + pxor xmm0, xmm0 + pcmpeqb xmm4, xmm4 + + pcmpeqb xmm1, xmm0 + pxor xmm4, xmm2 +%endmacro + +%macro BV_TRANSPOSE 0 + ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + + movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + + movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + + punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + + punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 + ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 +%endmacro + +%macro BV_WRITEBACK 2 + movd [rsi+2], %1 + movd [rsi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2], %1 + movd [rdi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rsi+2*rax+2], %1 + movd [rsi+2*rcx+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2*rax+2], %1 + movd [rdi+2*rcx+2], %2 +%endmacro + +%if ABI_IS_32BIT + +;void vp8_loop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE +sym(vp8_loop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 1, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + ; store 16-line result + + lea rdx, [rax] + neg rdx + + BV_WRITEBACK xmm1, xmm5 + + lea rsi, [rsi+rdx*8] + lea rdi, [rdi+rdx*8] + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE +sym(vp8_loop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 0, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ; store 16-line result + BV_WRITEBACK xmm1, xmm5 + + mov rsi, arg(0) ; u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%macro MBV_TRANSPOSE 0 + movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 + + movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 + + punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 + movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 + punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 +%endmacro + +%macro MBV_WRITEBACK_1 0 + movq [rsi], xmm0 + movhps [rdi], xmm0 + + movq [rsi+2*rax], xmm6 + movhps [rdi+2*rax], xmm6 + + movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 + punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 + + movq [rsi+4*rax], xmm0 + movhps [rdi+4*rax], xmm0 + + movq [rsi+2*rcx], xmm3 + movhps [rdi+2*rcx], xmm3 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 + + movdqa xmm0, xmm7 + punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 + punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 + + movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 + punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 +%endmacro + +%macro MBV_WRITEBACK_2 0 + movq [rsi], xmm1 + movhps [rdi], xmm1 + + movq [rsi+2*rax], xmm5 + movhps [rdi+2*rax], xmm5 + + movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 + punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 + + movq [rsi+4*rax], xmm1 + movhps [rdi+4*rax], xmm1 + + movq [rsi+2*rcx], xmm4 + movhps [rdi+2*rcx], xmm4 +%endmacro + + +;void vp8_mbloop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ; Transpose + TRANSPOSE_16X8 1, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + neg rax + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + + ; transpose and write back + MBV_TRANSPOSE + + neg rax + + MBV_WRITEBACK_1 + + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ; Transpose + TRANSPOSE_16X8 0, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + ; transpose and write back + MBV_TRANSPOSE + + mov rsi, arg(0) ;u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_1 + mov rsi, arg(5) ;v_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE +sym(vp8_loop_filter_simple_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + ; end prolog + + mov rcx, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + movdqa xmm6, [GLOBAL(tfe)] + lea rdx, [rcx + rax] + neg rax + + ; calculate mask + movdqa xmm0, [rdx] ; q1 + mov rdx, arg(2) ;blimit + movdqa xmm1, [rcx+2*rax] ; p1 + + movdqa xmm2, xmm1 + movdqa xmm3, xmm0 + + psubusb xmm0, xmm1 ; q1-=p1 + psubusb xmm1, xmm3 ; p1-=q1 + por xmm1, xmm0 ; abs(p1-q1) + pand xmm1, xmm6 ; set lsb of each byte to zero + psrlw xmm1, 1 ; abs(p1-q1)/2 + + movdqa xmm7, XMMWORD PTR [rdx] + + movdqa xmm5, [rcx+rax] ; p0 + movdqa xmm4, [rcx] ; q0 + movdqa xmm0, xmm4 ; q0 + movdqa xmm6, xmm5 ; p0 + psubusb xmm5, xmm4 ; p0-=q0 + psubusb xmm4, xmm6 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + + movdqa xmm4, [GLOBAL(t80)] + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 + + + ; start work on filters + pxor xmm2, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm2, xmm3 ; p1 - q1 + + pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm0, xmm4 ; offset to convert to signed values + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) + pand xmm5, xmm2 ; mask filter values we don't care about + + movdqa xmm0, xmm5 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 + paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 + + movdqa xmm1, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] + +; pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add + + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm6, xmm5 ; p0+= p0 add + + pxor xmm3, xmm4 ; unoffset + movdqa [rcx], xmm3 ; write back + + pxor xmm6, xmm4 ; unoffset + movdqa [rcx+rax], xmm6 ; write back + + ; begin epilog + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE +sym(vp8_loop_filter_simple_vertical_edge_sse2): + push rbp ; save old base pointer value. + mov rbp, rsp ; set new base pointer value. + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx ; save callee-saved reg + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi - 2 ] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movd xmm2, [rdi] ; 13 12 11 10 + movd xmm3, [rcx] ; 53 52 51 50 + punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 + punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 + + movd xmm4, [rsi + rax*2] ; 23 22 21 20 + movd xmm5, [rdx + rax*2] ; 63 62 61 60 + movd xmm6, [rdi + rax*2] ; 33 32 31 30 + movd xmm7, [rcx + rax*2] ; 73 72 71 70 + punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 + punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 + + punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 + punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 + + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + lea rsi, [rsi + rax*8] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm4, [rsi] ; 83 82 81 80 + movd xmm1, [rdx] ; c3 c2 c1 c0 + movd xmm6, [rdi] ; 93 92 91 90 + movd xmm3, [rcx] ; d3 d2 d1 d0 + punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 + punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 + + movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + + punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 + punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + + movdqa xmm7, xmm4 + punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + + movdqa xmm6, xmm4 + punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + + punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + mov rdx, arg(2) ;blimit + + ; calculate mask + movdqa xmm6, xmm0 ; p1 + movdqa xmm7, xmm3 ; q1 + psubusb xmm7, xmm0 ; q1-=p1 + psubusb xmm6, xmm3 ; p1-=q1 + por xmm6, xmm7 ; abs(p1-q1) + pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw xmm6, 1 ; abs(p1-q1)/2 + + movdqa xmm7, [rdx] + + movdqa xmm5, xmm1 ; p0 + movdqa xmm4, xmm2 ; q0 + psubusb xmm5, xmm2 ; p0-=q0 + psubusb xmm4, xmm1 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm4, [GLOBAL(t80)] + + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 ; mm5 = mask + + ; start work on filters + movdqa t0, xmm0 + movdqa t1, xmm3 + + pxor xmm0, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm0, xmm3 ; p1 - q1 + + pxor xmm1, xmm4 ; offset to convert to signed values + pxor xmm2, xmm4 ; offset to convert to signed values + + movdqa xmm3, xmm2 ; offseted ; q0 + psubsb xmm2, xmm1 ; q0 - p0 + paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) + pand xmm5, xmm0 ; mask filter values we don't care about + + movdqa xmm0, xmm5 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 + paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 + + movdqa xmm6, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] + +; pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm6 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add + + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm6 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm1, xmm5 ; p0+= p0 add + + pxor xmm3, xmm4 ; unoffset q0 + pxor xmm1, xmm4 ; unoffset p0 + + movdqa xmm0, t0 ; p1 + movdqa xmm4, t1 ; q1 + + ; write out order: xmm0 xmm2 xmm1 xmm3 + lea rdx, [rsi + rax*4] + + ; transpose back to write out + ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + movdqa xmm6, xmm0 + punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm5, xmm3 + punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + + movdqa xmm3, xmm6 + punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movd [rsi], xmm6 ; write the second 8-line result + movd [rdx], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rdi], xmm6 + movd [rcx], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rsi + rax*2], xmm6 + movd [rdx + rax*2], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rdi + rax*2], xmm6 + movd [rcx + rax*2], xmm3 + + neg rax + lea rsi, [rsi + rax*8] + neg rax + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd [rsi], xmm0 ; write the first 8-line result + movd [rdx], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rdi], xmm0 + movd [rcx], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rsi + rax*2], xmm0 + movd [rdx + rax*2], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rdi + rax*2], xmm0 + movd [rcx + rax*2], xmm2 + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +tfe: + times 16 db 0xfe +align 16 +t80: + times 16 db 0x80 +align 16 +t1s: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 +align 16 +ones: + times 8 dw 0x0001 +align 16 +s9: + times 8 dw 0x0900 +align 16 +s63: + times 8 dw 0x003f +align 16 +te0: + times 16 db 0xe0 +align 16 +t1f: + times 16 db 0x1f diff --git a/media/libvpx/vp8/common/x86/loopfilter_x86.c b/media/libvpx/vp8/common/x86/loopfilter_x86.c new file mode 100644 index 000000000..658600460 --- /dev/null +++ b/media/libvpx/vp8/common/x86/loopfilter_x86.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8/common/loopfilter.h" + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) + +#define prototype_loopfilter_nc(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh) + +#define prototype_simple_loopfilter(sym) \ + void sym(unsigned char *y, int ystride, const unsigned char *blimit) + +prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); +prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); +prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); +prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); +prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); +prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); + +#if HAVE_SSE2 && ARCH_X86_64 +prototype_loopfilter(vp8_loop_filter_bv_y_sse2); +prototype_loopfilter(vp8_loop_filter_bh_y_sse2); +#else +prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2); +#endif +prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2); + +extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; + +#if HAVE_MMX +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + + +void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit); +} + + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + + +void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); +} +#endif + + +/* Horizontal MB filtering */ +#if HAVE_SSE2 +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); +} + + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); +} + + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ +#if ARCH_X86_64 + vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +#else + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); +#endif + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride); +} + + +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit); +} + + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ +#if ARCH_X86_64 + vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +#else + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); +#endif + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4); +} + + +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); +} + +#endif diff --git a/media/libvpx/vp8/common/x86/mfqe_sse2.asm b/media/libvpx/vp8/common/x86/mfqe_sse2.asm new file mode 100644 index 000000000..a8a7f568d --- /dev/null +++ b/media/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -0,0 +1,287 @@ +; +; Copyright (c) 2012 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_filter_by_weight16x16_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp8_filter_by_weight16x16_sse2) PRIVATE +sym(vp8_filter_by_weight16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 16 ; loop count + pxor xmm6, xmm6 + +.combine + movdqa xmm2, [rax] + movdqa xmm4, [rdx] + add rax, rsi + + ; src * src_weight + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm6 + punpckhbw xmm3, xmm6 + pmullw xmm2, xmm0 + pmullw xmm3, xmm0 + + ; dst * dst_weight + movdqa xmm5, xmm4 + punpcklbw xmm4, xmm6 + punpckhbw xmm5, xmm6 + pmullw xmm4, xmm1 + pmullw xmm5, xmm1 + + ; sum, round and shift + paddw xmm2, xmm4 + paddw xmm3, xmm5 + paddw xmm2, [GLOBAL(tMFQE_round)] + paddw xmm3, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + psrlw xmm3, 4 + + packuswb xmm2, xmm3 + movdqa [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + + ret + +;void vp8_filter_by_weight8x8_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp8_filter_by_weight8x8_sse2) PRIVATE +sym(vp8_filter_by_weight8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 8 ; loop count + pxor xmm4, xmm4 + +.combine + movq xmm2, [rax] + movq xmm3, [rdx] + add rax, rsi + + ; src * src_weight + punpcklbw xmm2, xmm4 + pmullw xmm2, xmm0 + + ; dst * dst_weight + punpcklbw xmm3, xmm4 + pmullw xmm3, xmm1 + + ; sum, round and shift + paddw xmm2, xmm3 + paddw xmm2, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + + packuswb xmm2, xmm4 + movq [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + + ret + +;void vp8_variance_and_sad_16x16_sse2 | arg +;( +; unsigned char *src1, 0 +; int stride1, 1 +; unsigned char *src2, 2 +; int stride2, 3 +; unsigned int *variance, 4 +; unsigned int *sad, 5 +;) +global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE +sym(vp8_variance_and_sad_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ; src1 + mov rcx, arg(1) ; stride1 + mov rdx, arg(2) ; src2 + mov rdi, arg(3) ; stride2 + + mov rsi, 16 ; block height + + ; Prep accumulator registers + pxor xmm3, xmm3 ; SAD + pxor xmm4, xmm4 ; sum of src2 + pxor xmm5, xmm5 ; sum of src2^2 + + ; Because we're working with the actual output frames + ; we can't depend on any kind of data alignment. +.accumulate + movdqa xmm0, [rax] ; src1 + movdqa xmm1, [rdx] ; src2 + add rax, rcx ; src1 + stride1 + add rdx, rdi ; src2 + stride2 + + ; SAD(src1, src2) + psadbw xmm0, xmm1 + paddusw xmm3, xmm0 + + ; SUM(src2) + pxor xmm2, xmm2 + psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0 + paddusw xmm4, xmm2 + + ; pmaddubsw would be ideal if it took two unsigned values. instead, + ; it expects a signed and an unsigned value. so instead we zero extend + ; and operate on words. + pxor xmm2, xmm2 + movdqa xmm0, xmm1 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + paddd xmm5, xmm0 + paddd xmm5, xmm1 + + sub rsi, 1 + jnz .accumulate + + ; phaddd only operates on adjacent double words. + ; Finalize SAD and store + movdqa xmm0, xmm3 + psrldq xmm0, 8 + paddusw xmm0, xmm3 + paddd xmm0, [GLOBAL(t128)] + psrld xmm0, 8 + + mov rax, arg(5) + movd [rax], xmm0 + + ; Accumulate sum of src2 + movdqa xmm0, xmm4 + psrldq xmm0, 8 + paddusw xmm0, xmm4 + ; Square src2. Ignore high value + pmuludq xmm0, xmm0 + psrld xmm0, 8 + + ; phaddw could be used to sum adjacent values but we want + ; all the values summed. promote to doubles, accumulate, + ; shift and sum + pxor xmm2, xmm2 + movdqa xmm1, xmm5 + punpckldq xmm1, xmm2 + punpckhdq xmm5, xmm2 + paddd xmm1, xmm5 + movdqa xmm2, xmm1 + psrldq xmm1, 8 + paddd xmm1, xmm2 + + psubd xmm1, xmm0 + + ; (variance + 128) >> 8 + paddd xmm1, [GLOBAL(t128)] + psrld xmm1, 8 + mov rax, arg(4) + + movd [rax], xmm1 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t128: +%ifndef __NASM_VER__ + ddq 128 +%elif CONFIG_BIG_ENDIAN + dq 0, 128 +%else + dq 128, 0 +%endif +align 16 +tMFQE: ; 1 << MFQE_PRECISION + times 8 dw 0x10 +align 16 +tMFQE_round: ; 1 << (MFQE_PRECISION - 1) + times 8 dw 0x08 + diff --git a/media/libvpx/vp8/common/x86/postproc_mmx.asm b/media/libvpx/vp8/common/x86/postproc_mmx.asm new file mode 100644 index 000000000..a2b16327f --- /dev/null +++ b/media/libvpx/vp8/common/x86/postproc_mmx.asm @@ -0,0 +1,315 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + +;void vp8_mbpost_proc_down_mmx(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp8_rv) +global sym(vp8_mbpost_proc_down_mmx) PRIVATE +sym(vp8_mbpost_proc_down_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 136 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax +%define flimit2 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp8_rv))] +%endif + + ;rows +=8; + add dword ptr arg(2), 8 + + ;for(c=0; c<cols; c+=4) +.loop_col: + mov rsi, arg(0) ;s + pxor mm0, mm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + + ; this copies the last row down into the border 8 rows + mov rdi, rsi + mov rdx, arg(2) + sub rdx, 9 + imul rdx, rax + lea rdi, [rdi+rdx] + movq mm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_borderd ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], mm1 + + dec rcx + jne .init_borderd + + neg rax ; rax = -pitch + + ; this copies the first row up into the border 8 rows + mov rdi, rsi + movq mm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_border ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], mm1 + + dec rcx + jne .init_border + + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + + pxor mm5, mm5 + pxor mm6, mm6 ; + + pxor mm7, mm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +.loop_initvar: + movd mm1, DWORD PTR [rdi]; + punpcklbw mm1, mm0 ; + + paddw mm5, mm1 ; + pmullw mm1, mm1 ; + + movq mm2, mm1 ; + punpcklwd mm1, mm0 ; + + punpckhwd mm2, mm0 ; + paddd mm6, mm1 ; + + paddd mm7, mm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne .loop_initvar + ;save the var and sum + xor rdx, rdx +.loop_row: + movd mm1, DWORD PTR [rsi] ; [s-pitch*8] + movd mm2, DWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw mm1, mm0 + punpcklbw mm2, mm0 + + paddw mm5, mm2 + psubw mm5, mm1 + + pmullw mm2, mm2 + movq mm4, mm2 + + punpcklwd mm2, mm0 + punpckhwd mm4, mm0 + + paddd mm6, mm2 + paddd mm7, mm4 + + pmullw mm1, mm1 + movq mm2, mm1 + + punpcklwd mm1, mm0 + psubd mm6, mm1 + + punpckhwd mm2, mm0 + psubd mm7, mm2 + + + movq mm3, mm6 + pslld mm3, 4 + + psubd mm3, mm6 + movq mm1, mm5 + + movq mm4, mm5 + pmullw mm1, mm1 + + pmulhw mm4, mm4 + movq mm2, mm1 + + punpcklwd mm1, mm4 + punpckhwd mm2, mm4 + + movq mm4, mm7 + pslld mm4, 4 + + psubd mm4, mm7 + + psubd mm3, mm1 + psubd mm4, mm2 + + psubd mm3, flimit2 + psubd mm4, flimit2 + + psrad mm3, 31 + psrad mm4, 31 + + packssdw mm3, mm4 + packsswb mm3, mm0 + + movd mm1, DWORD PTR [rsi+rax*8] + + movq mm2, mm1 + punpcklbw mm1, mm0 + + paddw mm1, mm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [GLOBAL(sym(vp8_rv))] + movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2] +%else + movq mm4, [sym(vp8_rv) + rcx*2] +%endif + paddw mm1, mm4 + psraw mm1, 4 + + packuswb mm1, mm0 + pand mm1, mm3 + + pandn mm3, mm2 + por mm1, mm3 + + and rcx, 15 + movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] + + cmp edx, 8 + jl .skip_assignment + + mov rcx, rdx + sub rcx, 8 + and rcx, 15 + movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] + movd [rsi], mm1 + +.skip_assignment + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl .loop_row + + + add dword arg(0), 4 ; s += 4 + sub dword arg(3), 4 ; cols -= 4 + cmp dword arg(3), 0 + jg .loop_col + + add rsp, 136 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef flimit2 + + +;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +global sym(vp8_plane_add_noise_mmx) PRIVATE +sym(vp8_plane_add_noise_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movq mm1,[rsi+rax] ; get the source + + psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb mm1, [rdx+32] ;bothclamp + psubusb mm1, [rdx+16] ;whiteclamp + + movq mm2,[rdi+rax] ; get the noise for this line + paddb mm1,mm2 ; add it in + movq [rsi+rax],mm1 ; store the result + + add rax,8 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +Blur: + times 16 dw 16 + times 8 dw 64 + times 16 dw 16 + times 8 dw 0 + +rd: + times 4 dw 0x40 diff --git a/media/libvpx/vp8/common/x86/postproc_sse2.asm b/media/libvpx/vp8/common/x86/postproc_sse2.asm new file mode 100644 index 000000000..fed4ee5cc --- /dev/null +++ b/media/libvpx/vp8/common/x86/postproc_sse2.asm @@ -0,0 +1,723 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;macro in deblock functions +%macro FIRST_2_ROWS 0 + movdqa xmm4, xmm0 + movdqa xmm6, xmm0 + movdqa xmm5, xmm1 + pavgb xmm5, xmm3 + + ;calculate absolute value + psubusb xmm4, xmm1 + psubusb xmm1, xmm0 + psubusb xmm6, xmm3 + psubusb xmm3, xmm0 + paddusb xmm4, xmm1 + paddusb xmm6, xmm3 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm7, xmm2 + + ;get mask + psubusb xmm2, xmm4 + psubusb xmm7, xmm6 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm7, xmm1 + por xmm7, xmm2 +%endmacro + +%macro SECOND_2_ROWS 0 + movdqa xmm6, xmm0 + movdqa xmm4, xmm0 + movdqa xmm2, xmm1 + pavgb xmm1, xmm3 + + ;calculate absolute value + psubusb xmm6, xmm2 + psubusb xmm2, xmm0 + psubusb xmm4, xmm3 + psubusb xmm3, xmm0 + paddusb xmm6, xmm2 + paddusb xmm4, xmm3 + + pavgb xmm5, xmm1 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm3, xmm2 + + ;get mask + psubusb xmm2, xmm6 + psubusb xmm3, xmm4 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm3, xmm1 + + por xmm7, xmm2 + por xmm7, xmm3 + + pavgb xmm5, xmm0 + + ;decide if or not to use filtered value + pand xmm0, xmm7 + pandn xmm7, xmm5 + paddusb xmm0, xmm7 +%endmacro + +%macro UPDATE_FLIMIT 0 + movdqa xmm2, XMMWORD PTR [rbx] + movdqa [rsp], xmm2 + add rbx, 16 +%endmacro + +;void vp8_post_proc_down_and_across_mb_row_sse2 +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int cols, +; int *flimits, +; int size +;) +global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE +sym(vp8_post_proc_down_and_across_mb_row_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 + + ; put flimit on stack + mov rbx, arg(5) ;flimits ptr + UPDATE_FLIMIT + +%define flimit [rsp] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line + movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock +.nextrow: + xor rdx, rdx ;col +.nextcol: + ;load current and next 2 rows + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + 2*rax] + + FIRST_2_ROWS + + ;load above 2 rows + neg rax + movdqu xmm1, XMMWORD PTR [rsi + 2*rax] + movdqu xmm3, XMMWORD PTR [rsi + rax] + + SECOND_2_ROWS + + movdqu XMMWORD PTR [rdi], xmm0 + + neg rax ; positive stride + add rsi, 16 + add rdi, 16 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .downdone + UPDATE_FLIMIT + jmp .nextcol + +.downdone: + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + mov rbx, arg(5) ; flimits + UPDATE_FLIMIT + + ; dup the first byte into the left border 8 times + movq mm1, [rdi] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + mov rdx, -8 + movq [rdi+rdx], mm1 + + ; dup the last byte into the right border + movsxd rdx, dword arg(4) + movq mm1, [rdi + rdx + -1] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + movq [rdi+rdx], mm1 + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-16]; + movq mm1, QWORD PTR [rdi-8]; + +.acrossnextcol: + movdqu xmm0, XMMWORD PTR [rdi + rdx] + movdqu xmm1, XMMWORD PTR [rdi + rdx -2] + movdqu xmm3, XMMWORD PTR [rdi + rdx -1] + + FIRST_2_ROWS + + movdqu xmm1, XMMWORD PTR [rdi + rdx +1] + movdqu xmm3, XMMWORD PTR [rdi + rdx +2] + + SECOND_2_ROWS + + movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes + movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes + movdq2q mm0, xmm0 + psrldq xmm0, 8 + movdq2q mm1, xmm0 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .acrossdone + UPDATE_FLIMIT + jmp .acrossnextcol + +.acrossdone + ; last 16 pixels + movq QWORD PTR [rdi+rdx-16], mm0 + + cmp edx, dword arg(4) + jne .throw_last_8 + movq QWORD PTR [rdi+rdx-8], mm1 +.throw_last_8: + ; done with this rwo + add rsi,rax ;next src line + mov eax, dword arg(3) ;dst_pixels_per_line + add rdi,rax ;next destination + mov eax, dword arg(2) ;src_pixels_per_line + + mov rbx, arg(5) ;flimits + UPDATE_FLIMIT + + dec rcx ;decrement count + jnz .nextrow ;next row + + add rsp, 16 + pop rsp + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit + +;void vp8_mbpost_proc_down_xmm(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp8_rv) +global sym(vp8_mbpost_proc_down_xmm) PRIVATE +sym(vp8_mbpost_proc_down_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 128+16 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax + mov [rsp+128+8], eax + mov [rsp+128+12], eax +%define flimit4 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp8_rv))] +%endif + + ;rows +=8; + add dword arg(2), 8 + + ;for(c=0; c<cols; c+=8) +.loop_col: + mov rsi, arg(0) ; s + pxor xmm0, xmm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + + ; this copies the last row down into the border 8 rows + mov rdi, rsi + mov rdx, arg(2) + sub rdx, 9 + imul rdx, rax + lea rdi, [rdi+rdx] + movq xmm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_borderd ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], xmm1 + + dec rcx + jne .init_borderd + + neg rax ; rax = -pitch + + ; this copies the first row up into the border 8 rows + mov rdi, rsi + movq xmm1, QWORD ptr[rdi] ; first row + mov rcx, 8 +.init_border ; initialize borders + lea rdi, [rdi + rax] + movq [rdi], xmm1 + + dec rcx + jne .init_border + + + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + pxor xmm5, xmm5 + pxor xmm6, xmm6 ; + + pxor xmm7, xmm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +.loop_initvar: + movq xmm1, QWORD PTR [rdi]; + punpcklbw xmm1, xmm0 ; + + paddw xmm5, xmm1 ; + pmullw xmm1, xmm1 ; + + movdqa xmm2, xmm1 ; + punpcklwd xmm1, xmm0 ; + + punpckhwd xmm2, xmm0 ; + paddd xmm6, xmm1 ; + + paddd xmm7, xmm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne .loop_initvar + ;save the var and sum + xor rdx, rdx +.loop_row: + movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] + movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + paddw xmm5, xmm2 + psubw xmm5, xmm1 + + pmullw xmm2, xmm2 + movdqa xmm4, xmm2 + + punpcklwd xmm2, xmm0 + punpckhwd xmm4, xmm0 + + paddd xmm6, xmm2 + paddd xmm7, xmm4 + + pmullw xmm1, xmm1 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm0 + psubd xmm6, xmm1 + + punpckhwd xmm2, xmm0 + psubd xmm7, xmm2 + + + movdqa xmm3, xmm6 + pslld xmm3, 4 + + psubd xmm3, xmm6 + movdqa xmm1, xmm5 + + movdqa xmm4, xmm5 + pmullw xmm1, xmm1 + + pmulhw xmm4, xmm4 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm4 + punpckhwd xmm2, xmm4 + + movdqa xmm4, xmm7 + pslld xmm4, 4 + + psubd xmm4, xmm7 + + psubd xmm3, xmm1 + psubd xmm4, xmm2 + + psubd xmm3, flimit4 + psubd xmm4, flimit4 + + psrad xmm3, 31 + psrad xmm4, 31 + + packssdw xmm3, xmm4 + packsswb xmm3, xmm0 + + movq xmm1, QWORD PTR [rsi+rax*8] + + movq xmm2, xmm1 + punpcklbw xmm1, xmm0 + + paddw xmm1, xmm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [GLOBAL(sym(vp8_rv))] + movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2] +%else + movdqu xmm4, [sym(vp8_rv) + rcx*2] +%endif + + paddw xmm1, xmm4 + ;paddw xmm1, eight8s + psraw xmm1, 4 + + packuswb xmm1, xmm0 + pand xmm1, xmm3 + + pandn xmm3, xmm2 + por xmm1, xmm3 + + and rcx, 15 + movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] + + cmp edx, 8 + jl .skip_assignment + + mov rcx, rdx + sub rcx, 8 + and rcx, 15 + movq mm0, [rsp + rcx*8] ;d[rcx*8] + movq [rsi], mm0 + +.skip_assignment + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl .loop_row + + add dword arg(0), 8 ; s += 8 + sub dword arg(3), 8 ; cols -= 8 + cmp dword arg(3), 0 + jg .loop_col + + add rsp, 128+16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE +sym(vp8_mbpost_proc_across_ip_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 + + ; create flimit4 at [rsp] + mov eax, dword ptr arg(4) ;flimit + mov [rsp], eax + mov [rsp+4], eax + mov [rsp+8], eax + mov [rsp+12], eax +%define flimit4 [rsp] + + + ;for(r=0;r<rows;r++) +.ip_row_loop: + + xor rdx, rdx ;sumsq=0; + xor rcx, rcx ;sum=0; + mov rsi, arg(0); s + + + ; dup the first byte into the left border 8 times + movq mm1, [rsi] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + + mov rdi, -8 + movq [rsi+rdi], mm1 + + ; dup the last byte into the right border + movsxd rdx, dword arg(3) + movq mm1, [rsi + rdx + -1] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + movq [rsi+rdx], mm1 + +.ip_var_loop: + ;for(i=-8;i<=6;i++) + ;{ + ; sumsq += s[i]*s[i]; + ; sum += s[i]; + ;} + movzx eax, byte [rsi+rdi] + add ecx, eax + mul al + add edx, eax + add rdi, 1 + cmp rdi, 6 + jle .ip_var_loop + + + ;mov rax, sumsq + ;movd xmm7, rax + movd xmm7, edx + + ;mov rax, sum + ;movd xmm6, rax + movd xmm6, ecx + + mov rsi, arg(0) ;s + xor rcx, rcx + + movsxd rdx, dword arg(3) ;cols + add rdx, 8 + pxor mm0, mm0 + pxor mm1, mm1 + + pxor xmm0, xmm0 +.nextcol4: + + movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 + movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 + + punpcklbw xmm1, xmm0 ; expanding + punpcklbw xmm2, xmm0 ; expanding + + punpcklwd xmm1, xmm0 ; expanding to dwords + punpcklwd xmm2, xmm0 ; expanding to dwords + + psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 + paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2 + + paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5 + pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5 + + paddd xmm6, xmm2 + paddd xmm7, xmm1 + + pshufd xmm6, xmm6, 0 ; duplicate the last ones + pshufd xmm7, xmm7, 0 ; duplicate the last ones + + psrldq xmm1, 4 ; 8--7 9--6 10--5 0000 + psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 + + pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared + pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared + + paddd xmm6, xmm4 + paddd xmm7, xmm3 + + pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared + pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared + pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + movdqa xmm3, xmm6 + pmaddwd xmm3, xmm3 + + movdqa xmm5, xmm7 + pslld xmm5, 4 + + psubd xmm5, xmm7 + psubd xmm5, xmm3 + + psubd xmm5, flimit4 + psrad xmm5, 31 + + packssdw xmm5, xmm0 + packsswb xmm5, xmm0 + + movd xmm1, DWORD PTR [rsi+rcx] + movq xmm2, xmm1 + + punpcklbw xmm1, xmm0 + punpcklwd xmm1, xmm0 + + paddd xmm1, xmm6 + paddd xmm1, [GLOBAL(four8s)] + + psrad xmm1, 4 + packssdw xmm1, xmm0 + + packuswb xmm1, xmm0 + pand xmm1, xmm5 + + pandn xmm5, xmm2 + por xmm5, xmm1 + + movd [rsi+rcx-8], mm0 + movq mm0, mm1 + + movdq2q mm1, xmm5 + psrldq xmm7, 12 + + psrldq xmm6, 12 + add rcx, 4 + + cmp rcx, rdx + jl .nextcol4 + + ;s+=pitch; + movsxd rax, dword arg(1) + add arg(0), rax + + sub dword arg(2), 1 ;rows-=1 + cmp dword arg(2), 0 + jg .ip_row_loop + + add rsp, 16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +global sym(vp8_plane_add_noise_wmt) PRIVATE +sym(vp8_plane_add_noise_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movdqu xmm1,[rsi+rax] ; get the source + + psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb xmm1, [rdx+32] ;bothclamp + psubusb xmm1, [rdx+16] ;whiteclamp + + movdqu xmm2,[rdi+rax] ; get the noise for this line + paddb xmm1,xmm2 ; add it in + movdqu [rsi+rax],xmm1 ; store the result + + add rax,16 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +four8s: + times 4 dd 8 diff --git a/media/libvpx/vp8/common/x86/recon_mmx.asm b/media/libvpx/vp8/common/x86/recon_mmx.asm new file mode 100644 index 000000000..15e98713c --- /dev/null +++ b/media/libvpx/vp8/common/x86/recon_mmx.asm @@ -0,0 +1,274 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void copy_mem8x8_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x8_mmx) PRIVATE +sym(vp8_copy_mem8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + add rsi, rax + + movq [rdi+rcx], mm1 + movq [rdi+rcx*2], mm2 + + + lea rdi, [rdi+rcx*2] + movq mm3, [rsi] + + add rdi, rcx + movq mm4, [rsi+rax] + + movq mm5, [rsi+rax*2] + movq [rdi], mm3 + + lea rsi, [rsi+rax*2] + movq [rdi+rcx], mm4 + + movq [rdi+rcx*2], mm5 + lea rdi, [rdi+rcx*2] + + movq mm0, [rsi+rax] + movq mm1, [rsi+rax*2] + + movq [rdi+rcx], mm0 + movq [rdi+rcx*2],mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem8x4_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x4_mmx) PRIVATE +sym(vp8_copy_mem8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + movq [rdi+rcx], mm1 + + movq [rdi+rcx*2], mm2 + lea rdi, [rdi+rcx*2] + + movq mm3, [rsi+rax] + movq [rdi+rcx], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem16x16_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_mmx) PRIVATE +sym(vp8_copy_mem16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movsxd rax, dword ptr arg(1) ;src_stride; + + mov rdi, arg(2) ;dst; + movsxd rcx, dword ptr arg(3) ;dst_stride + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq [rdi], mm0 + movq [rdi+8], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/common/x86/recon_sse2.asm b/media/libvpx/vp8/common/x86/recon_sse2.asm new file mode 100644 index 000000000..7141f8324 --- /dev/null +++ b/media/libvpx/vp8/common/x86/recon_sse2.asm @@ -0,0 +1,1115 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void copy_mem16x16_sse2( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_sse2) PRIVATE +sym(vp8_copy_mem16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movdqu xmm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movdqu xmm1, [rsi+rax] + movdqu xmm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + lea rdi, [rdi+rcx*2] + movdqu xmm3, [rsi] + + add rdi, rcx + movdqu xmm4, [rsi+rax] + + movdqu xmm5, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm3 + add rsi, rax + + movdqa [rdi+rcx], xmm4 + movdqa [rdi+rcx*2],xmm5 + + lea rdi, [rdi+rcx*2] + movdqu xmm0, [rsi] + + add rdi, rcx + movdqu xmm1, [rsi+rax] + + movdqu xmm2, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + + movdqa [rdi+rcx*2], xmm2 + movdqu xmm3, [rsi] + + movdqu xmm4, [rsi+rax] + lea rdi, [rdi+rcx*2] + + add rdi, rcx + movdqu xmm5, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm3 + + add rsi, rax + movdqa [rdi+rcx], xmm4 + + movdqa [rdi+rcx*2],xmm5 + movdqu xmm0, [rsi] + + lea rdi, [rdi+rcx*2] + movdqu xmm1, [rsi+rax] + + add rdi, rcx + movdqu xmm2, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm0 + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + movdqu xmm3, [rsi+rax] + lea rdi, [rdi+rcx*2] + + movdqa [rdi+rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_intra_pred_uv_dc_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE +sym(vp8_intra_pred_uv_dc_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ; from top + mov rdi, arg(2) ;above; + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + pxor mm0, mm0 + movq mm1, [rdi] + lea rdi, [rax*3] + psadbw mm1, mm0 + ; from left + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax*1] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + + movzx edx, byte [rsi+rdi] + lea rsi, [rsi+rax*4] + add ecx, edx + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + + ; add up + pextrw edx, mm1, 0x0 + lea edx, [edx+ecx+8] + sar edx, 4 + movd mm1, edx + movsxd rcx, dword ptr arg(1) ;dst_stride + pshufw mm1, mm1, 0x0 + mov rdi, arg(0) ;dst; + packuswb mm1, mm1 + + ; write out + lea rax, [rcx*3] + lea rdx, [rdi+rcx*4] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + movq [rdx ], mm1 + movq [rdx+rcx ], mm1 + movq [rdx+rcx*2], mm1 + movq [rdx+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dctop_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE +sym(vp8_intra_pred_uv_dctop_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;arg(3), arg(4) not used + + ; from top + mov rsi, arg(2) ;above; + pxor mm0, mm0 + movq mm1, [rsi] + psadbw mm1, mm0 + + ; add up + paddw mm1, [GLOBAL(dc_4)] + psraw mm1, 3 + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dcleft_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE +sym(vp8_intra_pred_uv_dcleft_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ;arg(2) not used + + ; from left + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + lea rdi, [rax*3] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + lea edx, [ecx+edx+4] + + ; add up + shr edx, 3 + movd mm1, edx + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dc128_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE +sym(vp8_intra_pred_uv_dc128_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + ; end prolog + + ;arg(2), arg(3), arg(4) not used + + ; write out + movq mm1, [GLOBAL(dc_128)] + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_tm_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +%macro vp8_intra_pred_uv_tm 1 +global sym(vp8_intra_pred_uv_tm_%1) PRIVATE +sym(vp8_intra_pred_uv_tm_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + push rbx + ; end prolog + + ; read top row + mov edx, 4 + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; + pxor xmm0, xmm0 +%ifidn %1, ssse3 + movdqa xmm2, [GLOBAL(dc_1024)] +%endif + movq xmm1, [rsi] + punpcklbw xmm1, xmm0 + + ; set up left ptrs ans subtract topleft + movd xmm3, [rsi-1] + mov rsi, arg(3) ;left; +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + pshuflw xmm3, xmm3, 0x0 + punpcklqdq xmm3, xmm3 +%else + pshufb xmm3, xmm2 +%endif + psubw xmm1, xmm3 + + ; set up dest ptrs + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + +.vp8_intra_pred_uv_tm_%1_loop: + mov bl, [rsi] + movd xmm3, ebx + + mov bl, [rsi+rax] + movd xmm5, ebx +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + punpcklbw xmm5, xmm0 + pshuflw xmm3, xmm3, 0x0 + pshuflw xmm5, xmm5, 0x0 + punpcklqdq xmm3, xmm3 + punpcklqdq xmm5, xmm5 +%else + pshufb xmm3, xmm2 + pshufb xmm5, xmm2 +%endif + paddw xmm3, xmm1 + paddw xmm5, xmm1 + packuswb xmm3, xmm5 + movq [rdi ], xmm3 + movhps[rdi+rcx], xmm3 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz .vp8_intra_pred_uv_tm_%1_loop + + ; begin epilog + pop rbx + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp8_intra_pred_uv_tm sse2 +vp8_intra_pred_uv_tm ssse3 + +;void vp8_intra_pred_uv_ve_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE +sym(vp8_intra_pred_uv_ve_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + ; end prolog + + ; arg(3), arg(4) not used + + ; read from top + mov rax, arg(2) ;src; + + movq mm1, [rax] + + ; write out + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_ho_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +%macro vp8_intra_pred_uv_ho 1 +global sym(vp8_intra_pred_uv_ho_%1) PRIVATE +sym(vp8_intra_pred_uv_ho_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx +%ifidn %1, ssse3 + GET_GOT rbx +%endif + ; end prolog + + ;arg(2) not used + + ; read from left and write out +%ifidn %1, mmx2 + mov edx, 4 +%endif + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride +%ifidn %1, ssse3 + lea rdx, [rcx*3] + movdqa xmm2, [GLOBAL(dc_00001111)] +%endif + +%ifidn %1, mmx2 +.vp8_intra_pred_uv_ho_%1_loop: + mov bl, [rsi] + movd mm0, ebx + + mov bl, [rsi+rax] + movd mm1, ebx + + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + pshufw mm0, mm0, 0x0 + pshufw mm1, mm1, 0x0 + movq [rdi ], mm0 + movq [rdi+rcx], mm1 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz .vp8_intra_pred_uv_ho_%1_loop +%else + mov bl, [rsi] + movd xmm0, ebx + + mov bl, [rsi+rax] + movd xmm3, ebx + + mov bl, [rsi+rax*2] + movd xmm1, ebx + + lea rbx, [rax*3] + mov bl, [rsi+rbx] + movd xmm4, ebx + + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 + lea rsi, [rsi+rax*4] + lea rdi, [rdi+rcx*4] + + mov bl, [rsi] + movd xmm0, ebx + + mov bl, [rsi+rax] + movd xmm3, ebx + + mov bl, [rsi+rax*2] + movd xmm1, ebx + + lea rbx, [rax*3] + mov bl, [rsi+rbx] + movd xmm4, ebx + + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 +%endif + + ; begin epilog +%ifidn %1, ssse3 + RESTORE_GOT +%endif + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp8_intra_pred_uv_ho mmx2 +vp8_intra_pred_uv_ho ssse3 + +;void vp8_intra_pred_y_dc_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dc_sse2) PRIVATE +sym(vp8_intra_pred_y_dc_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ; from top + mov rdi, arg(2) ;above + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; + + pxor xmm0, xmm0 + movdqa xmm1, [rdi] + psadbw xmm1, xmm0 + movq xmm2, xmm1 + punpckhqdq xmm1, xmm1 + paddw xmm1, xmm2 + + ; from left + lea rdi, [rax*3] + + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + + ; add up + pextrw edx, xmm1, 0x0 + lea edx, [edx+ecx+16] + sar edx, 5 + movd xmm1, edx + ; FIXME use pshufb for ssse3 version + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm1, xmm1 + packuswb xmm1, xmm1 + + ; write out + mov rsi, 2 + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + +.label + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + dec rsi + jnz .label + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_dctop_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE +sym(vp8_intra_pred_y_dctop_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + GET_GOT rbx + ; end prolog + + ;arg(3), arg(4) not used + + ; from top + mov rcx, arg(2) ;above; + pxor xmm0, xmm0 + movdqa xmm1, [rcx] + psadbw xmm1, xmm0 + movdqa xmm2, xmm1 + punpckhqdq xmm1, xmm1 + paddw xmm1, xmm2 + + ; add up + paddw xmm1, [GLOBAL(dc_8)] + psraw xmm1, 4 + ; FIXME use pshufb for ssse3 version + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm1, xmm1 + packuswb xmm1, xmm1 + + ; write out + mov rsi, 2 + mov rdx, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + +.label + movdqa [rdx ], xmm1 + movdqa [rdx+rcx ], xmm1 + movdqa [rdx+rcx*2], xmm1 + movdqa [rdx+rax ], xmm1 + lea rdx, [rdx+rcx*4] + movdqa [rdx ], xmm1 + movdqa [rdx+rcx ], xmm1 + movdqa [rdx+rcx*2], xmm1 + movdqa [rdx+rax ], xmm1 + lea rdx, [rdx+rcx*4] + dec rsi + jnz .label + + ; begin epilog + RESTORE_GOT + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_dcleft_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE +sym(vp8_intra_pred_y_dcleft_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ;arg(2) not used + + ; from left + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + + lea rdi, [rax*3] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + lea edx, [ecx+edx+8] + + ; add up + shr edx, 4 + movd xmm1, edx + ; FIXME use pshufb for ssse3 version + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm1, xmm1 + packuswb xmm1, xmm1 + + ; write out + mov rsi, 2 + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + +.label + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + movdqa [rdi ], xmm1 + movdqa [rdi+rcx ], xmm1 + movdqa [rdi+rcx*2], xmm1 + movdqa [rdi+rax ], xmm1 + lea rdi, [rdi+rcx*4] + dec rsi + jnz .label + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_dc128_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE +sym(vp8_intra_pred_y_dc128_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + GET_GOT rbx + ; end prolog + + ;arg(2), arg(3), arg(4) not used + + ; write out + mov rsi, 2 + movdqa xmm1, [GLOBAL(dc_128)] + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + +.label + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + dec rsi + jnz .label + + ; begin epilog + RESTORE_GOT + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_tm_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +%macro vp8_intra_pred_y_tm 1 +global sym(vp8_intra_pred_y_tm_%1) PRIVATE +sym(vp8_intra_pred_y_tm_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rbx + GET_GOT rbx + ; end prolog + + ; read top row + mov edx, 8 + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; + pxor xmm0, xmm0 +%ifidn %1, ssse3 + movdqa xmm3, [GLOBAL(dc_1024)] +%endif + movdqa xmm1, [rsi] + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + + ; set up left ptrs ans subtract topleft + movd xmm4, [rsi-1] + mov rsi, arg(3) ;left +%ifidn %1, sse2 + punpcklbw xmm4, xmm0 + pshuflw xmm4, xmm4, 0x0 + punpcklqdq xmm4, xmm4 +%else + pshufb xmm4, xmm3 +%endif + psubw xmm1, xmm4 + psubw xmm2, xmm4 + + ; set up dest ptrs + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride +vp8_intra_pred_y_tm_%1_loop: + mov bl, [rsi] + movd xmm4, ebx + + mov bl, [rsi+rax] + movd xmm5, ebx +%ifidn %1, sse2 + punpcklbw xmm4, xmm0 + punpcklbw xmm5, xmm0 + pshuflw xmm4, xmm4, 0x0 + pshuflw xmm5, xmm5, 0x0 + punpcklqdq xmm4, xmm4 + punpcklqdq xmm5, xmm5 +%else + pshufb xmm4, xmm3 + pshufb xmm5, xmm3 +%endif + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + paddw xmm4, xmm1 + paddw xmm6, xmm2 + paddw xmm5, xmm1 + paddw xmm7, xmm2 + packuswb xmm4, xmm6 + packuswb xmm5, xmm7 + movdqa [rdi ], xmm4 + movdqa [rdi+rcx], xmm5 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz vp8_intra_pred_y_tm_%1_loop + + ; begin epilog + RESTORE_GOT + pop rbx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp8_intra_pred_y_tm sse2 +vp8_intra_pred_y_tm ssse3 + +;void vp8_intra_pred_y_ve_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride +; ) +global sym(vp8_intra_pred_y_ve_sse2) PRIVATE +sym(vp8_intra_pred_y_ve_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + ; end prolog + + ;arg(3), arg(4) not used + + mov rax, arg(2) ;above; + mov rsi, 2 + movsxd rdx, dword ptr arg(1) ;dst_stride + + ; read from top + movdqa xmm1, [rax] + + ; write out + mov rax, arg(0) ;dst; + lea rcx, [rdx*3] + +.label + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + movdqa [rax ], xmm1 + movdqa [rax+rdx ], xmm1 + movdqa [rax+rdx*2], xmm1 + movdqa [rax+rcx ], xmm1 + lea rax, [rax+rdx*4] + dec rsi + jnz .label + + ; begin epilog + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_y_ho_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *above, +; unsigned char *left, +; int left_stride, +; ) +global sym(vp8_intra_pred_y_ho_sse2) PRIVATE +sym(vp8_intra_pred_y_ho_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + ;arg(2) not used + + ; read from left and write out + mov edx, 8 + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + +vp8_intra_pred_y_ho_sse2_loop: + mov bl, [rsi] + movd xmm0, ebx + mov bl, [rsi+rax] + movd xmm1, ebx + + ; FIXME use pshufb for ssse3 version + punpcklbw xmm0, xmm0 + punpcklbw xmm1, xmm1 + pshuflw xmm0, xmm0, 0x0 + pshuflw xmm1, xmm1, 0x0 + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + movdqa [rdi ], xmm0 + movdqa [rdi+rcx], xmm1 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz vp8_intra_pred_y_ho_sse2_loop + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +dc_128: + times 16 db 128 +dc_4: + times 4 dw 4 +align 16 +dc_8: + times 8 dw 8 +align 16 +dc_1024: + times 8 dw 0x400 +align 16 +dc_00001111: + times 8 db 0 + times 8 db 1 diff --git a/media/libvpx/vp8/common/x86/recon_wrapper_sse2.c b/media/libvpx/vp8/common/x86/recon_wrapper_sse2.c new file mode 100644 index 000000000..65f4251a9 --- /dev/null +++ b/media/libvpx/vp8/common/x86/recon_wrapper_sse2.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/blockd.h" + +#define build_intra_predictors_mbuv_prototype(sym) \ + void sym(unsigned char *dst, int dst_stride, \ + const unsigned char *above, \ + const unsigned char *left, int left_stride) +typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t)); + +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3); + +static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_stride, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + build_intra_predictors_mbuv_fn_t tm_func, + build_intra_predictors_mbuv_fn_t ho_func) +{ + int mode = x->mode_info_context->mbmi.uv_mode; + build_intra_predictors_mbuv_fn_t fn; + + switch (mode) { + case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break; + case H_PRED: fn = ho_func; break; + case TM_PRED: fn = tm_func; break; + case DC_PRED: + if (x->up_available) { + if (x->left_available) { + fn = vp8_intra_pred_uv_dc_mmx2; break; + } else { + fn = vp8_intra_pred_uv_dctop_mmx2; break; + } + } else if (x->left_available) { + fn = vp8_intra_pred_uv_dcleft_mmx2; break; + } else { + fn = vp8_intra_pred_uv_dc128_mmx; break; + } + break; + default: return; + } + + fn(dst_u, dst_stride, uabove_row, uleft, left_stride); + fn(dst_v, dst_stride, vabove_row, vleft, left_stride); +} + +void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) +{ + vp8_build_intra_predictors_mbuv_x86(x, + uabove_row, vabove_row, + upred_ptr, + vpred_ptr, pred_stride, + uleft, + vleft, + left_stride, + vp8_intra_pred_uv_tm_sse2, + vp8_intra_pred_uv_ho_mmx2); +} + +void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) +{ + vp8_build_intra_predictors_mbuv_x86(x, + uabove_row, vabove_row, + upred_ptr, + vpred_ptr, pred_stride, + uleft, + vleft, + left_stride, + vp8_intra_pred_uv_tm_ssse3, + vp8_intra_pred_uv_ho_ssse3); +} + +#define build_intra_predictors_mby_prototype(sym) \ + void sym(unsigned char *dst, int dst_stride, \ + const unsigned char *above, \ + const unsigned char *left, int left_stride) +typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t)); + +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3); + +static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char *dst_y, + int dst_stride, + unsigned char * yleft, + int left_stride, + build_intra_predictors_mby_fn_t tm_func) +{ + int mode = x->mode_info_context->mbmi.mode; + build_intra_predictors_mbuv_fn_t fn; + + switch (mode) { + case V_PRED: fn = vp8_intra_pred_y_ve_sse2; break; + case H_PRED: fn = vp8_intra_pred_y_ho_sse2; break; + case TM_PRED: fn = tm_func; break; + case DC_PRED: + if (x->up_available) { + if (x->left_available) { + fn = vp8_intra_pred_y_dc_sse2; break; + } else { + fn = vp8_intra_pred_y_dctop_sse2; break; + } + } else if (x->left_available) { + fn = vp8_intra_pred_y_dcleft_sse2; break; + } else { + fn = vp8_intra_pred_y_dc128_sse2; break; + } + break; + default: return; + } + + fn(dst_y, dst_stride, yabove_row, yleft, left_stride); + return; +} + +void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) +{ + vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr, + y_stride, yleft, left_stride, + vp8_intra_pred_y_tm_sse2); +} + +void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) +{ + vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr, + y_stride, yleft, left_stride, + vp8_intra_pred_y_tm_ssse3); + +} diff --git a/media/libvpx/vp8/common/x86/subpixel_mmx.asm b/media/libvpx/vp8/common/x86/subpixel_mmx.asm new file mode 100644 index 000000000..47dd45229 --- /dev/null +++ b/media/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -0,0 +1,702 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) + + +%define BLOCK_HEIGHT_WIDTH 4 +%define vp8_filter_weight 128 +%define VP8_FILTER_SHIFT 7 + + +;void vp8_filter_block1d_h6_mmx +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +global sym(vp8_filter_block1d_h6_mmx) PRIVATE +sym(vp8_filter_block1d_h6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + + movq mm1, [rdx + 16] ; do both the negative taps first!!! + movq mm2, [rdx + 32] ; + movq mm6, [rdx + 48] ; + movq mm7, [rdx + 64] ; + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + +.nextrow: + movq mm3, [rsi-2] ; mm3 = p-2..p5 + movq mm4, mm3 ; mm4 = p-2..p5 + psrlq mm3, 8 ; mm3 = p-1..p5 + punpcklbw mm3, mm0 ; mm3 = p-1..p2 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + movq mm5, mm4 ; mm5 = p-2..p5 + punpckhbw mm4, mm0 ; mm5 = p2..p5 + pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + movq mm4, mm5 ; mm4 = p-2..p5; + psrlq mm5, 16 ; mm5 = p0..p5; + punpcklbw mm5, mm0 ; mm5 = p0..p3 + pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + movq mm5, mm4 ; mm5 = p-2..p5 + psrlq mm4, 24 ; mm4 = p1..p5 + punpcklbw mm4, mm0 ; mm4 = p1..p4 + pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + ; do outer positive taps + movd mm4, [rsi+3] + punpcklbw mm4, mm0 ; mm5 = p3..p6 + pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + paddsw mm3, [GLOBAL(rd)] ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and unpack to saturate + punpcklbw mm3, mm0 ; + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line + add rdi, rax; +%else + movsxd r8, dword ptr arg(2) ;src_pixels_per_line + add rdi, rax; + + add rsi, r8 ; next line +%endif + + dec rcx ; decrement count + jnz .nextrow ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1dc_v6_mmx +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int output_pitch, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +global sym(vp8_filter_block1dc_v6_mmx) PRIVATE +sym(vp8_filter_block1dc_v6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movq mm5, [GLOBAL(rd)] + push rbx + mov rbx, arg(7) ;vp8_filter + movq mm1, [rbx + 16] ; do both the negative taps first!!! + movq mm2, [rbx + 32] ; + movq mm6, [rbx + 48] ; + movq mm7, [rbx + 64] ; + + movsxd rdx, dword ptr arg(3) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + sub rsi, rdx + sub rsi, rdx + movsxd rcx, DWORD PTR arg(5) ;output_height + movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + + +.nextrow_cv: + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi] ; mm4 = p0..p3 = row -2 + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + paddsw mm3, mm5 ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and saturate + + movd [rdi],mm3 ; store the results in the destination + ; the subsequent iterations repeat 3 out of 4 of these reads. Since the + ; recon block should be in cache this shouldn't cost much. Its obviously + ; avoidable!!!. + lea rdi, [rdi+rax] ; + dec rcx ; decrement count + jnz .nextrow_cv ; next row + + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x8_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_mmx) PRIVATE +sym(vp8_bilinear_predict8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + shl rax, 5 ; offset * 32 + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + + shl rax, 5 ; offset*32 + add rax, rcx ; VFilter + + lea rcx, [rdi+rdx*8] ; + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x8: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 ;dst_pitch +%endif + cmp rdi, rcx ; + jne .next_row_8x8 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x4_mmx) PRIVATE +sym(vp8_bilinear_predict8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + shl rax, 5 + + mov rsi, arg(0) ;src_ptr ; + add rax, rcx + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x4: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 +%endif + cmp rdi, rcx ; + jne .next_row_8x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict4x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict4x4_mmx) PRIVATE +sym(vp8_bilinear_predict4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + shl rax, 5 + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;ldst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movq mm7, mm3 ; + packuswb mm7, mm0 ; + + add rsi, rdx ; next line +.next_row_4x4: + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + + movq mm5, mm7 ; + punpcklbw mm5, mm0 ; + + pmullw mm5, [rax] ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + movq mm7, mm3 ; + + packuswb mm7, mm0 ; + + pmullw mm3, [rax+16] ; + paddw mm3, mm5 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb mm3, mm0 + movd [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch ; + add rsi, rdx ; next line + add rdi, r8 +%endif + + cmp rdi, rcx ; + jne .next_row_4x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +SECTION_RODATA +align 16 +rd: + times 4 dw 0x40 + +align 16 +global HIDDEN_DATA(sym(vp8_six_tap_mmx)) +sym(vp8_six_tap_mmx): + times 8 dw 0 + times 8 dw 0 + times 8 dw 128 + times 8 dw 0 + times 8 dw 0 + times 8 dw 0 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw 0 + + times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + + times 8 dw 0 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw 0 + + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw 0 + + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + + times 8 dw 0 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 dw -6 + times 8 dw 0 + + diff --git a/media/libvpx/vp8/common/x86/subpixel_sse2.asm b/media/libvpx/vp8/common/x86/subpixel_sse2.asm new file mode 100644 index 000000000..69f8d103c --- /dev/null +++ b/media/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -0,0 +1,1372 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +global sym(vp8_filter_block1d8_h6_sse2) PRIVATE +sym(vp8_filter_block1d8_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +global sym(vp8_filter_block1d16_h6_sse2) PRIVATE +sym(vp8_filter_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi+16], xmm4 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +global sym(vp8_filter_block1d8_v6_sse2) PRIVATE +sym(vp8_filter_block1d8_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_sse2_loop: + movdqa xmm1, XMMWORD PTR [rsi] + pmullw xmm1, [rax] + + movdqa xmm2, XMMWORD PTR [rsi + rdx] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] + pmullw xmm3, [rax + 32] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] + pmullw xmm5, [rax + 64] + + add rsi, rdx + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] + + pmullw xmm4, [rax + 48] + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] + + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_v6_sse2 +;( +; unsigned short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; const short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +global sym(vp8_filter_block1d16_v6_sse2) PRIVATE +sym(vp8_filter_block1d16_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d16_v6_sse2_loop: +; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. + movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 + movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] + pmullw xmm1, [rax + 16] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm3, [rax + 64] + pmullw xmm4, [rax + 64] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm5, [rax + 32] + pmullw xmm6, [rax + 32] + + movdqa xmm7, XMMWORD PTR [rsi] ; line 1 + movdqa xmm0, XMMWORD PTR [rsi + 16] + pmullw xmm7, [rax] + pmullw xmm0, [rax] + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + paddsw xmm1, xmm7 + paddsw xmm2, xmm0 + + add rsi, rdx + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm3, [rax + 48] + pmullw xmm4, [rax + 48] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm5, [rax + 80] + pmullw xmm6, [rax + 80] + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] + pxor xmm0, xmm0 ; clear xmm0 + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + + paddsw xmm1, xmm7 + paddsw xmm2, xmm7 + + psraw xmm1, 7 + psraw xmm2, 7 + + packuswb xmm1, xmm2 ; pack and saturate + movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d16_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE +sym(vp8_filter_block1d8_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_only_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + + movq QWORD PTR [rdi], xmm4 ; store the results in the destination + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_only_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE +sym(vp8_filter_block1d16_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_only_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; lower 8 bytes + + movq QWORD Ptr [rdi], xmm4 ; store the results in the destination + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; higher 8 bytes + + movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_only_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; Second-pass filter only when xoffset==0 +global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE +sym(vp8_filter_block1d8_v6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + mov rax, arg(5) ;vp8_filter + + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_only_sse2_loop: + movq xmm1, MMWORD PTR [rsi] + movq xmm2, MMWORD PTR [rsi + rdx] + movq xmm3, MMWORD PTR [rsi + rdx * 2] + movq xmm5, MMWORD PTR [rsi + rdx * 4] + add rsi, rdx + movq xmm4, MMWORD PTR [rsi + rdx * 2] + movq xmm6, MMWORD PTR [rsi + rdx * 4] + + punpcklbw xmm1, xmm0 + pmullw xmm1, [rax] + + punpcklbw xmm2, xmm0 + pmullw xmm2, [rax + 16] + + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax + 32] + + punpcklbw xmm5, xmm0 + pmullw xmm5, [rax + 64] + + punpcklbw xmm4, xmm0 + pmullw xmm4, [rax + 48] + + punpcklbw xmm6, xmm0 + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_unpack_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE +sym(vp8_unpack_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(3) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source + + pxor xmm0, xmm0 ; clear xmm0 for unpack +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source +%endif + +.unpack_block1d16_h6_sse2_rowloop: + movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 + movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + punpcklbw xmm1, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm1 + movdqa XMMWORD Ptr [rdi + 16], xmm3 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(4) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + jnz .unpack_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict16x16_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +extern sym(vp8_bilinear_filters_x86_8) +global sym(vp8_bilinear_predict16x16_sse2) PRIVATE +sym(vp8_bilinear_predict16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + movsxd rax, dword ptr arg(2) ;xoffset + + cmp rax, 0 ;skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + + cmp rax, 0 ;skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;dst_pitch +%endif + ; get the first horizontal line done + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + add rsi, rdx ; next line +.next_row: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, [rax] + pmullw xmm6, [rax] + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + pmullw xmm3, [rax+16] + pmullw xmm4, [rax+16] + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rdx ; next line +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ;dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + add rsi, rax ; next line +.next_row_spo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + movdqa xmm4, xmm3 ; make a copy of current line + movdqa xmm7, xmm3 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm5, xmm1 + pmullw xmm6, xmm1 + pmullw xmm3, xmm2 + pmullw xmm4, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ;dst_pitch + cmp rdi, rcx + jne .next_row_spo + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + pxor xmm0, xmm0 + +.next_row_fpo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ; dst_pitch + cmp rdi, rcx + jne .next_row_fpo + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict8x8_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_sse2) PRIVATE +sym(vp8_bilinear_predict8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ;xoffset + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm5, [rax] + movdqa xmm6, [rax+16] + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqa xmm3, XMMWORD PTR [rsp] + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + add rsp, 16 ; next line +.next_row8x8: + movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + pmullw xmm7, xmm5 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm4, xmm3 + + pmullw xmm3, xmm6 + paddw xmm3, xmm7 + + movdqa xmm7, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb xmm3, xmm0 + movq [rdi], xmm3 ; store the results in the destination + + add rsp, 16 ; next line + add rdi, rdx + + cmp rdi, rcx + jne .next_row8x8 + + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd: + times 8 dw 0x40 diff --git a/media/libvpx/vp8/common/x86/subpixel_ssse3.asm b/media/libvpx/vp8/common/x86/subpixel_ssse3.asm new file mode 100644 index 000000000..c06f24556 --- /dev/null +++ b/media/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -0,0 +1,1508 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +; +; This is an implementation of some of the SSE optimizations first seen in ffvp8 +; +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE +sym(vp8_filter_block1d8_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 + + movdqa xmm7, [GLOBAL(rd)] + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + mov rdi, arg(2) ;output_ptr + + cmp esi, DWORD PTR [rax] + je vp8_filter_block1d8_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx +;xmm3 free +.filter_block1d8_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + pmaddubsw xmm1, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm1 + paddsw xmm2, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + jnz .filter_block1d8_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d8_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] + movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] + + mov rsi, arg(0) ;src_ptr + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx + +.filter_block1d8_h4_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm2, xmm0 + pshufb xmm0, xmm3 + + pshufb xmm2, xmm4 + pmaddubsw xmm0, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + + jnz .filter_block1d8_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d16_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE +sym(vp8_filter_block1d16_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + mov rdi, arg(2) ;output_ptr + + mov rsi, arg(0) ;src_ptr + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d16_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + movq xmm3, MMWORD PTR [rsi + 6] + + pmaddubsw xmm1, xmm5 + movq xmm7, MMWORD PTR [rsi + 11] + + pmaddubsw xmm2, xmm6 + punpcklbw xmm3, xmm7 + + paddsw xmm0, xmm1 + movdqa xmm1, xmm3 + + pmaddubsw xmm3, xmm4 + paddsw xmm0, xmm2 + + movdqa xmm2, xmm1 + paddsw xmm0, [GLOBAL(rd)] + + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + + psraw xmm0, 7 + pmaddubsw xmm1, xmm5 + + pmaddubsw xmm2, xmm6 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + paddsw xmm3, xmm1 + + paddsw xmm3, xmm2 + + paddsw xmm3, [GLOBAL(rd)] + + psraw xmm3, 7 + + packuswb xmm3, xmm3 + + punpcklqdq xmm0, xmm3 + + movdqa XMMWORD Ptr [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d16_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d4_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE +sym(vp8_filter_block1d4_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + movdqa xmm7, [GLOBAL(rd)] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +;xmm3 free +.filter_block1d4_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf1b)] + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2b)] + pmaddubsw xmm0, xmm4 + pshufb xmm2, [GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm0, xmm1 + paddsw xmm0, xmm7 + pxor xmm1, xmm1 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movd DWORD PTR [rdi], xmm0 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d4_h4_rowloop_ssse3: + movdqu xmm1, XMMWORD PTR [rsi - 2] + + movdqa xmm2, xmm1 + pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] + pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm1, xmm7 + paddsw xmm1, xmm2 + psraw xmm1, 7 + packuswb xmm1, xmm1 + + movd DWORD PTR [rdi], xmm1 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + + +;void vp8_filter_block1d16_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE +sym(vp8_filter_block1d16_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d16_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + +.vp8_filter_block1d16_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 ;store the results + + movq xmm1, MMWORD PTR [rsi + 8] ;A + movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi+8], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d16_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + +.vp8_filter_block1d16_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + paddsw xmm2, [GLOBAL(rd)] + paddsw xmm2, xmm3 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + punpcklbw xmm5, xmm4 ;B D + punpcklbw xmm1, xmm0 ;C E + + pmaddubsw xmm1, xmm6 + pmaddubsw xmm5, xmm7 + + movdqa xmm4, [GLOBAL(rd)] + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm5, xmm1 + paddsw xmm5, xmm4 + psraw xmm5, 7 + packuswb xmm5, xmm5 + + punpcklqdq xmm2, xmm5 + + movdqa XMMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d8_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE +sym(vp8_filter_block1d8_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d8_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + movdqa xmm4, [GLOBAL(rd)] + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d8_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm5, [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm5 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d4_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE +sym(vp8_filter_block1d4_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_v4_ssse3 + + movq mm5, MMWORD PTR [rax] ;k0_k5 + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v6_ssse3_loop: + movd mm1, DWORD PTR [rsi] ;A + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + movd mm0, DWORD PTR [rax + rdx * 4] ;F + + movq mm4, [GLOBAL(rd)] + + pmaddubsw mm3, mm6 + punpcklbw mm1, mm0 ;A F + pmaddubsw mm2, mm7 + pmaddubsw mm1, mm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm1 + paddsw mm2, mm4 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_v4_ssse3: + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + movq mm5, MMWORD PTR [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v4_ssse3_loop: + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + pmaddubsw mm3, mm6 + pmaddubsw mm2, mm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm5 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict16x16_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE +sym(vp8_bilinear_predict16x16_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(2) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 4 + lea rax, [rax + rcx] ; HFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ; src_pixels_per_line + + movdqa xmm2, [rax] + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ; dst_pitch +%endif + movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 + + punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 + pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm6, xmm5 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm6, xmm1 + + punpcklbw xmm4, xmm5 + pmaddubsw xmm4, xmm1 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + packuswb xmm6, xmm4 + movdqa xmm5, xmm7 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm2 + + punpckhbw xmm7, xmm6 + pmaddubsw xmm7, xmm2 + + paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value + psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm5, xmm7 + movdqa xmm7, xmm6 + + movdqa [rdi], xmm5 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ; dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + + ; get the first horizontal line done + movq xmm4, [rsi] ; load row 0 + movq xmm2, [rsi + 8] ; load row 0 + + lea rsi, [rsi + rax] ; next line +.next_row_sp: + movq xmm3, [rsi] ; load row + 1 + movq xmm5, [rsi + 8] ; load row + 1 + + punpcklbw xmm4, xmm3 + punpcklbw xmm2, xmm5 + + pmaddubsw xmm4, xmm1 + movq xmm7, [rsi + rax] ; load row + 2 + + pmaddubsw xmm2, xmm1 + movq xmm6, [rsi + rax + 8] ; load row + 2 + + punpcklbw xmm3, xmm7 + punpcklbw xmm5, xmm6 + + pmaddubsw xmm3, xmm1 + paddw xmm4, [GLOBAL(rd)] + + pmaddubsw xmm5, xmm1 + paddw xmm2, [GLOBAL(rd)] + + psraw xmm4, VP8_FILTER_SHIFT + psraw xmm2, VP8_FILTER_SHIFT + + packuswb xmm4, xmm2 + paddw xmm3, [GLOBAL(rd)] + + movdqa [rdi], xmm4 ; store row 0 + paddw xmm5, [GLOBAL(rd)] + + psraw xmm3, VP8_FILTER_SHIFT + psraw xmm5, VP8_FILTER_SHIFT + + packuswb xmm3, xmm5 + movdqa xmm4, xmm7 + + movdqa [rdi + rdx],xmm3 ; store row 1 + lea rsi, [rsi + 2*rax] + + movdqa xmm2, xmm6 + lea rdi, [rdi + 2*rdx] + + cmp rdi, rcx + jne .next_row_sp + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + +.next_row_fp: + movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm2, xmm4 + movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 + + pmaddubsw xmm2, xmm1 + movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rax] ; next line + punpcklbw xmm3, xmm4 + + pmaddubsw xmm3, xmm1 + movq xmm5, [rsi] + + paddw xmm2, [GLOBAL(rd)] + movq xmm7, [rsi+1] + + movq xmm6, [rsi+8] + psraw xmm2, VP8_FILTER_SHIFT + + punpcklbw xmm5, xmm7 + movq xmm7, [rsi+9] + + paddw xmm3, [GLOBAL(rd)] + pmaddubsw xmm5, xmm1 + + psraw xmm3, VP8_FILTER_SHIFT + punpcklbw xmm6, xmm7 + + packuswb xmm2, xmm3 + pmaddubsw xmm6, xmm1 + + movdqa [rdi], xmm2 ; store the results in the destination + paddw xmm5, [GLOBAL(rd)] + + lea rdi, [rdi + rdx] ; dst_pitch + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm6, VP8_FILTER_SHIFT + + packuswb xmm5, xmm6 + lea rsi, [rsi + rax] ; next line + + movdqa [rdi], xmm5 ; store the results in the destination + lea rdi, [rdi + rdx] ; dst_pitch + + cmp rdi, rcx + + jne .next_row_fp + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict8x8_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE +sym(vp8_bilinear_predict8x8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ; xoffset + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b8x8_sp_only + + shl rax, 4 + add rax, rcx ; HFilter + + mov rdi, arg(4) ; dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b8x8_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm1, [rax] + + ; get the first horizontal line done + movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx + + psrldq xmm5, 1 + lea rsp, [rsp + 16] ; next line + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + lea rsp, [rsp + 16] ; next line + + movdqa xmm5, xmm6 + + psrldq xmm5, 1 + + punpcklbw xmm6, xmm5 + pmaddubsw xmm6, xmm0 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + packuswb xmm6, xmm6 + + punpcklbw xmm7, xmm6 + pmaddubsw xmm7, xmm1 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm7, xmm7 + + movq [rdi], xmm7 ; store the results in the destination + lea rdi, [rdi + rdx] + + movdqa xmm7, xmm6 + + cmp rdi, rcx + jne .next_row + + jmp .done8x8 + +.b8x8_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] ; VFilter + + movq xmm1, XMMWORD PTR [rsp] + movq xmm2, XMMWORD PTR [rsp+16] + + movq xmm3, XMMWORD PTR [rsp+32] + punpcklbw xmm1, xmm2 + + movq xmm4, XMMWORD PTR [rsp+48] + punpcklbw xmm2, xmm3 + + movq xmm5, XMMWORD PTR [rsp+64] + punpcklbw xmm3, xmm4 + + movq xmm6, XMMWORD PTR [rsp+80] + punpcklbw xmm4, xmm5 + + movq xmm7, XMMWORD PTR [rsp+96] + punpcklbw xmm5, xmm6 + + pmaddubsw xmm1, xmm0 + pmaddubsw xmm2, xmm0 + + pmaddubsw xmm3, xmm0 + pmaddubsw xmm4, xmm0 + + pmaddubsw xmm5, xmm0 + punpcklbw xmm6, xmm7 + + pmaddubsw xmm6, xmm0 + paddw xmm1, [GLOBAL(rd)] + + paddw xmm2, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm2, VP8_FILTER_SHIFT + + paddw xmm4, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + psraw xmm6, VP8_FILTER_SHIFT + packuswb xmm1, xmm1 + + packuswb xmm2, xmm2 + movq [rdi], xmm1 + + packuswb xmm3, xmm3 + movq [rdi+rdx], xmm2 + + packuswb xmm4, xmm4 + movq xmm1, XMMWORD PTR [rsp+112] + + lea rdi, [rdi + 2*rdx] + movq xmm2, XMMWORD PTR [rsp+128] + + packuswb xmm5, xmm5 + movq [rdi], xmm3 + + packuswb xmm6, xmm6 + movq [rdi+rdx], xmm4 + + lea rdi, [rdi + 2*rdx] + punpcklbw xmm7, xmm1 + + movq [rdi], xmm5 + pmaddubsw xmm7, xmm0 + + movq [rdi+rdx], xmm6 + punpcklbw xmm1, xmm2 + + pmaddubsw xmm1, xmm0 + paddw xmm7, [GLOBAL(rd)] + + psraw xmm7, VP8_FILTER_SHIFT + paddw xmm1, [GLOBAL(rd)] + + psraw xmm1, VP8_FILTER_SHIFT + packuswb xmm7, xmm7 + + packuswb xmm1, xmm1 + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm7 + + movq [rdi+rdx], xmm1 + lea rsp, [rsp + 144] + + jmp .done8x8 + +.b8x8_fp_only: + lea rcx, [rdi+rdx*8] + +.next_row_fp: + movdqa xmm1, XMMWORD PTR [rsp] + movdqa xmm3, XMMWORD PTR [rsp+16] + + movdqa xmm2, xmm1 + movdqa xmm5, XMMWORD PTR [rsp+32] + + psrldq xmm2, 1 + movdqa xmm7, XMMWORD PTR [rsp+48] + + movdqa xmm4, xmm3 + psrldq xmm4, 1 + + movdqa xmm6, xmm5 + psrldq xmm6, 1 + + punpcklbw xmm1, xmm2 + pmaddubsw xmm1, xmm0 + + punpcklbw xmm3, xmm4 + pmaddubsw xmm3, xmm0 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm0 + + movdqa xmm2, xmm7 + psrldq xmm2, 1 + + punpcklbw xmm7, xmm2 + pmaddubsw xmm7, xmm0 + + paddw xmm1, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm7, [GLOBAL(rd)] + psraw xmm7, VP8_FILTER_SHIFT + + packuswb xmm1, xmm1 + packuswb xmm3, xmm3 + + packuswb xmm5, xmm5 + movq [rdi], xmm1 + + packuswb xmm7, xmm7 + movq [rdi+rdx], xmm3 + + lea rdi, [rdi + 2*rdx] + movq [rdi], xmm5 + + lea rsp, [rsp + 4*16] + movq [rdi+rdx], xmm7 + + lea rdi, [rdi + 2*rdx] + cmp rdi, rcx + + jne .next_row_fp + + lea rsp, [rsp + 16] + +.done8x8: + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +shuf2b: + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 +shuf3b: + db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 + +align 16 +shuf2bfrom1: + db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 +align 16 +shuf3bfrom1: + db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 + +align 16 +rd: + times 8 dw 0x40 + +align 16 +k0_k5: + times 8 db 0, 0 ;placeholder + times 8 db 0, 0 + times 8 db 2, 1 + times 8 db 0, 0 + times 8 db 3, 3 + times 8 db 0, 0 + times 8 db 1, 2 + times 8 db 0, 0 +k1_k3: + times 8 db 0, 0 ;placeholder + times 8 db -6, 12 + times 8 db -11, 36 + times 8 db -9, 50 + times 8 db -16, 77 + times 8 db -6, 93 + times 8 db -8, 108 + times 8 db -1, 123 +k2_k4: + times 8 db 128, 0 ;placeholder + times 8 db 123, -1 + times 8 db 108, -8 + times 8 db 93, -6 + times 8 db 77, -16 + times 8 db 50, -9 + times 8 db 36, -11 + times 8 db 12, -6 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 + diff --git a/media/libvpx/vp8/common/x86/variance_impl_sse2.asm b/media/libvpx/vp8/common/x86/variance_impl_sse2.asm new file mode 100644 index 000000000..26de5e860 --- /dev/null +++ b/media/libvpx/vp8/common/x86/variance_impl_sse2.asm @@ -0,0 +1,972 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + +;void vp8_filter_block2d_bil_var_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared;; +; +;) +global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE +sym(vp8_filter_block2d_bil_var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + push rbx + ; end prolog + + pxor xmm6, xmm6 ; + pxor xmm7, xmm7 ; + + lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding + movdqa xmm4, XMMWORD PTR [rsi] + + lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_sse2_sp_only + + shl rax, 5 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_sse2_fp_only + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + movdqa xmm5, xmm1 + + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +filter_block2d_bil_var_sse2_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movdqa xmm3, xmm5 ; + movdqa xmm5, xmm1 ; + + pmullw xmm3, [rdx] ; + pmullw xmm1, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rbx] ;ref_pixels_per_line +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 ; + jnz filter_block2d_bil_var_sse2_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 + je filter_block2d_bil_var_sse2_full_pixel + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movq xmm3, QWORD PTR [rsi] ; + punpcklbw xmm3, xmm0 ; + movdqa xmm5, xmm3 + + pmullw xmm1, [rdx] ; + pmullw xmm3, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + movdqa xmm1, xmm5 ; + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_sp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 ; + +filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movq xmm2, QWORD PTR [rdi] ; + punpcklbw xmm2, xmm0 ; + + psubw xmm1, xmm2 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_full_pixel_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_fp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_variance: + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(7) ; sum + mov rdi, arg(8) ; sumsquared + + movd [rsi], mm2 ; xsum + movd [rdi], mm4 ; xxsum + + ; begin epilog + pop rbx + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_horiz_vert_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE +sym(vp8_half_horiz_vert_variance8x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source +%else + add rsi, r8 +%endif + +vp8_half_horiz_vert_variance8x_h_1: + + movq xmm1, QWORD PTR [rsi] ; + movq xmm2, QWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + + sub rcx, 1 ; + jnz vp8_half_horiz_vert_variance8x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_half_horiz_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE +sym(vp8_half_horiz_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + + movdqu xmm5, XMMWORD PTR [rsi] + movdqu xmm3, XMMWORD PTR [rsi+1] + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + + lea rsi, [rsi + rax] + +vp8_half_horiz_vert_variance16x_h_1: + movdqu xmm1, XMMWORD PTR [rsi] ; + movdqu xmm2, XMMWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm4, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + psubw xmm5, xmm3 ; xmm5 -= xmm3 + + movq xmm3, QWORD PTR [rdi+8] + punpcklbw xmm3, xmm0 + psubw xmm4, xmm3 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_vert_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_vert_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE +sym(vp8_half_vert_variance8x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; +vp8_half_vert_variance8x_h_1: + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + + sub rcx, 1 ; + jnz vp8_half_vert_variance8x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_half_vert_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE +sym(vp8_half_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr + + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + movdqu xmm5, XMMWORD PTR [rsi] + lea rsi, [rsi + rax ] + pxor xmm0, xmm0 + +vp8_half_vert_variance16x_h_1: + movdqu xmm3, XMMWORD PTR [rsi] + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 + punpckhbw xmm4, xmm0 + + movq xmm2, QWORD PTR [rdi] + punpcklbw xmm2, xmm0 + psubw xmm5, xmm2 + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + psubw xmm4, xmm2 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm3 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 + jnz vp8_half_vert_variance16x_h_1 + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_horiz_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE +sym(vp8_half_horiz_variance8x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor xmm0, xmm0 ; +vp8_half_horiz_variance8x_h_1: + movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 + movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + punpcklbw xmm5, xmm0 ; xmm5 = words of above + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + +%if ABI_IS_32BIT + add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source + add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz vp8_half_horiz_variance8x_h_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_half_horiz_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE +sym(vp8_half_horiz_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + +vp8_half_horiz_variance16x_h_1: + movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 + movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm1, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm1, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + psubw xmm1, xmm2 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm1 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm1, xmm1 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +vp8_bilinear_filters_sse2: + dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 + dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 + dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 + dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 + dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 + dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 + dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/media/libvpx/vp8/common/x86/variance_impl_ssse3.asm b/media/libvpx/vp8/common/x86/variance_impl_ssse3.asm new file mode 100644 index 000000000..686b4a902 --- /dev/null +++ b/media/libvpx/vp8/common/x86/variance_impl_ssse3.asm @@ -0,0 +1,364 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + + +;void vp8_filter_block2d_bil_var_ssse3 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int xoffset, +; int yoffset, +; int *sum, +; unsigned int *sumsquared;; +; +;) +;Note: The filter coefficient at offset=0 is 128. Since the second register +;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. +global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE +sym(vp8_filter_block2d_bil_var_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 + pxor xmm7, xmm7 + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .filter_block2d_bil_var_ssse3_sp_only + + shl rax, 4 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je .filter_block2d_bil_var_ssse3_fp_only + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi+1] + movdqa xmm2, xmm0 + + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, [rax] + pmaddubsw xmm2, [rax] + + paddw xmm0, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm0, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + packuswb xmm0, xmm2 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + lea rsi, [rsi + r8] +%endif + +.filter_block2d_bil_var_ssse3_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + packuswb xmm1, xmm3 + + movdqa xmm2, xmm0 + movdqa xmm0, xmm1 + movdqa xmm3, xmm2 + + punpcklbw xmm2, xmm1 + punpckhbw xmm3, xmm1 + pmaddubsw xmm2, [rdx] + pmaddubsw xmm3, [rdx] + + paddw xmm2, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm2, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm1, QWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm1, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm2, xmm1 + psubw xmm3, xmm5 + paddw xmm6, xmm2 + paddw xmm6, xmm3 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm2 + paddd xmm7, xmm3 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rsi, [rsi + r8] + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz .filter_block2d_bil_var_ssse3_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_var_ssse3_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; Both xoffset =0 and yoffset=0 + je .filter_block2d_bil_var_ssse3_full_pixel + + shl rdx, 4 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + movdqu xmm1, XMMWORD PTR [rsi] + movdqa xmm0, xmm1 + +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + + lea rsi, [rsi + rax] + +.filter_block2d_bil_sp_only_loop: + movdqu xmm3, XMMWORD PTR [rsi] + movdqa xmm2, xmm1 + movdqa xmm0, xmm3 + + punpcklbw xmm1, xmm3 + punpckhbw xmm2, xmm3 + pmaddubsw xmm1, [rdx] + pmaddubsw xmm2, [rdx] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm2, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm2, xmm_filter_shift + + movq xmm3, QWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm3, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm3 + psubw xmm2, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + movdqa xmm1, xmm0 + lea rsi, [rsi + rax] ;ref_pixels_per_line + +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz .filter_block2d_bil_sp_only_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_var_ssse3_full_pixel: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + pxor xmm0, xmm0 + +.filter_block2d_bil_full_pixel_loop: + movq xmm1, QWORD PTR [rsi] + punpcklbw xmm1, xmm0 + movq xmm2, QWORD PTR [rsi+8] + punpcklbw xmm2, xmm0 + + movq xmm3, QWORD PTR [rdi] + punpcklbw xmm3, xmm0 + movq xmm4, QWORD PTR [rdi+8] + punpcklbw xmm4, xmm0 + + psubw xmm1, xmm3 + psubw xmm2, xmm4 + paddw xmm6, xmm1 + paddw xmm6, xmm2 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm7, xmm1 + paddd xmm7, xmm2 + + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rdx] ;src_pixels_per_line + sub rcx, 1 + jnz .filter_block2d_bil_full_pixel_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_var_ssse3_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 + +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +.filter_block2d_bil_fp_only_loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rsi+1] + movdqa xmm3, xmm1 + + punpcklbw xmm1, xmm2 + punpckhbw xmm3, xmm2 + pmaddubsw xmm1, [rax] + pmaddubsw xmm3, [rax] + + paddw xmm1, [GLOBAL(xmm_bi_rd)] + paddw xmm3, [GLOBAL(xmm_bi_rd)] + psraw xmm1, xmm_filter_shift + psraw xmm3, xmm_filter_shift + + movq xmm2, XMMWORD PTR [rdi] + pxor xmm4, xmm4 + punpcklbw xmm2, xmm4 + movq xmm5, QWORD PTR [rdi+8] + punpcklbw xmm5, xmm4 + + psubw xmm1, xmm2 + psubw xmm3, xmm5 + paddw xmm6, xmm1 + paddw xmm6, xmm3 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd xmm7, xmm1 + paddd xmm7, xmm3 + + lea rsi, [rsi + rdx] +%if ABI_IS_32BIT + add rdi, dword ptr arg(3) ;src_pixels_per_line +%else + lea rdi, [rdi + r9] +%endif + + sub rcx, 1 + jnz .filter_block2d_bil_fp_only_loop + + jmp .filter_block2d_bil_variance + +.filter_block2d_bil_variance: + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(7) ;[Sum] + mov rdi, arg(8) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 diff --git a/media/libvpx/vp8/common/x86/variance_ssse3.c b/media/libvpx/vp8/common/x86/variance_ssse3.c new file mode 100644 index 000000000..2a0df640a --- /dev/null +++ b/media/libvpx/vp8/common/x86/variance_ssse3.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_config.h" +#include "vp8/common/variance.h" +#include "vpx_ports/mem.h" + +extern void vp8_half_horiz_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_horiz_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_half_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_ssse3 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int xoffset, + int yoffset, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance16x16_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0; + unsigned int xxsum0; + + /* note we could avoid these if statements if the calling function + * just called the appropriate functions inside. + */ + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + +unsigned int vp8_sub_pixel_variance16x8_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0; + unsigned int xxsum0; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); +} diff --git a/media/libvpx/vp8/common/x86/vp8_asm_stubs.c b/media/libvpx/vp8/common/x86/vp8_asm_stubs.c new file mode 100644 index 000000000..fb0b57eb1 --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "filter_x86.h" + +extern const short vp8_six_tap_mmx[8][6*8]; + +extern void vp8_filter_block1d_h6_mmx +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1dc_v6_mmx +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d16_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_unpack_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width +); +extern void vp8_filter_block1d8_h6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); +extern void vp8_filter_block1d16_h6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); +extern void vp8_filter_block1d8_v6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); + + +#if HAVE_MMX +void vp8_sixtap_predict4x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[16*16]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); + +} + + +void vp8_sixtap_predict16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + + HFilter = vp8_six_tap_mmx[xoffset]; + + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); + +} + + +void vp8_sixtap_predict8x8_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); + +} + + +void vp8_sixtap_predict8x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); + +} + + + +void vp8_bilinear_predict16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); +} +#endif + + +#if HAVE_SSE2 +void vp8_sixtap_predict16x16_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch + +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); + } +} + + +void vp8_sixtap_predict8x8_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); + } +} + + +void vp8_sixtap_predict8x4_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); + } +} + +#endif + +#if HAVE_SSSE3 + +extern void vp8_filter_block1d8_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d8_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d4_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d4_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +void vp8_sixtap_predict16x16_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch + +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[24*24]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 16, 21, xoffset); + vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, + 16, yoffset); + } + else + { + /* First-pass only */ + vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 16, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 16, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict8x8_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 8, 13, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, + 8, yoffset); + } + else + { + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 8, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 8, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + + +void vp8_sixtap_predict8x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 8, 9, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, + 4, yoffset); + } + else + { + /* First-pass only */ + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict4x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[4*9]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + FData2, 4, 9, xoffset); + vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, + 4, yoffset); + } + else + { + vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } + else + { + if (yoffset) + { + vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + int r; + + for (r = 0; r < 4; r++) + { + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr += dst_pitch; + src_ptr += src_pixels_per_line; + } + } + } +} + +#endif diff --git a/media/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm b/media/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm new file mode 100644 index 000000000..97f25275d --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define mmx_filter_shift 7 + +;void vp8_filter_block2d_bil4x4_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE +sym(vp8_filter_block2d_bil4x4_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + + mov rax, arg(4) ;HFilter ; + mov rdx, arg(5) ;VFilter ; + + mov rsi, arg(0) ;ref_ptr ; + mov rdi, arg(2) ;src_ptr ; + + mov rcx, 4 ; + pxor mm0, mm0 ; + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm5, mm1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif + +.filter_block2d_bil4x4_var_mmx_loop: + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm3, mm5 ; + + movq mm5, mm1 ; + pmullw mm3, [rdx] ; + + pmullw mm1, [rdx+8] ; + paddw mm1, mm3 ; + + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + movd mm3, [rdi] ; + punpcklbw mm3, mm0 ; + + psubw mm1, mm3 ; + paddw mm6, mm1 ; + + pmaddwd mm1, mm1 ; + paddd mm7, mm1 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil4x4_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(6) ;sum + mov rsi, arg(7) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + + +;void vp8_filter_block2d_bil_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE +sym(vp8_filter_block2d_bil_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor mm0, mm0 ; + movq mm1, [rsi] ; + + movq mm3, [rsi+1] ; + movq mm2, mm1 ; + + movq mm4, mm3 ; + punpcklbw mm1, mm0 ; + + punpckhbw mm2, mm0 ; + pmullw mm1, [rax] ; + + pmullw mm2, [rax] ; + punpcklbw mm3, mm0 ; + + punpckhbw mm4, mm0 ; + pmullw mm3, [rax+8] ; + + pmullw mm4, [rax+8] ; + paddw mm1, mm3 ; + + paddw mm2, mm4 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm2, mmx_filter_shift ; + movq mm5, mm1 + + packuswb mm5, mm2 ; +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + add rsi, r8 +%endif + +.filter_block2d_bil_var_mmx_loop: + + movq mm1, [rsi] ; + movq mm3, [rsi+1] ; + + movq mm2, mm1 ; + movq mm4, mm3 ; + + punpcklbw mm1, mm0 ; + punpckhbw mm2, mm0 ; + + pmullw mm1, [rax] ; + pmullw mm2, [rax] ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + pmullw mm3, [rax+8] ; + pmullw mm4, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + psraw mm2, mmx_filter_shift ; + + movq mm3, mm5 ; + movq mm4, mm5 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + movq mm5, mm1 ; + packuswb mm5, mm2 ; + + pmullw mm3, [rdx] ; + pmullw mm4, [rdx] ; + + pmullw mm1, [rdx+8] ; + pmullw mm2, [rdx+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + psraw mm2, mmx_filter_shift ; + + movq mm3, [rdi] ; + movq mm4, mm3 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + psubw mm1, mm3 ; + psubw mm2, mm4 ; + + paddw mm6, mm1 ; + pmaddwd mm1, mm1 ; + + paddw mm6, mm2 ; + pmaddwd mm2, mm2 ; + + paddd mm7, mm1 ; + paddd mm7, mm2 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(7) ;sum + mov rsi, arg(8) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +;short mmx_bi_rd[4] = { 64, 64, 64, 64}; +align 16 +mmx_bi_rd: + times 4 dw 64 diff --git a/media/libvpx/vp8/common/x86/vp8_variance_mmx.c b/media/libvpx/vp8/common/x86/vp8_variance_mmx.c new file mode 100644 index 000000000..e594b1e65 --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_variance_mmx.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_config.h" +#include "vp8/common/variance.h" +#include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" + +extern void filter_block1d_h6_mmx +( + const unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *filter +); +extern void filter_block1d_v6_mmx +( + const short *src_ptr, + unsigned char *output_ptr, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *filter +); + +extern void vp8_filter_block2d_bil4x4_var_mmx +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_mmx +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance4x4_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) + +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil4x4_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +} + + +unsigned int vp8_sub_pixel_variance8x8_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 6)); +} + +unsigned int vp8_sub_pixel_variance16x16_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); + + +} + +unsigned int vp8_sub_pixel_variance16x8_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_mmx +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 7)); +} + + +unsigned int vp8_variance_halfpixvar16x16_h_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_v_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_hv_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, + ref_ptr, recon_stride, sse); +} diff --git a/media/libvpx/vp8/common/x86/vp8_variance_sse2.c b/media/libvpx/vp8/common/x86/vp8_variance_sse2.c new file mode 100644 index 000000000..1c15ed880 --- /dev/null +++ b/media/libvpx/vp8/common/x86/vp8_variance_sse2.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_config.h" +#include "vp8/common/variance.h" +#include "vpx_ports/mem.h" +#include "vp8/common/x86/filter_x86.h" + +extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); +extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); + +extern void vp8_filter_block2d_bil4x4_var_mmx +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); + +void vp8_filter_block2d_bil_var_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int xoffset, + int yoffset, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_vert_variance16x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); + +unsigned int vp8_sub_pixel_variance4x4_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil4x4_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +} + + +unsigned int vp8_sub_pixel_variance8x8_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum, &xxsum); + } + + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 6)); +} + +unsigned int vp8_sub_pixel_variance16x16_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + /* note we could avoid these if statements if the calling function + * just called the appropriate functions inside. + */ + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum0, &xxsum0 + ); + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum1, &xxsum1 + ); + xsum0 += xsum1; + xxsum0 += xxsum1; + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + +unsigned int vp8_sub_pixel_variance16x8_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum1, &xxsum1); + xsum0 += xsum1; + xxsum0 += xxsum1; + } + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_wmt +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance8x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum, &xxsum); + } + + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 7)); +} + + +unsigned int vp8_variance_halfpixvar16x16_h_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0; + unsigned int xxsum0; + + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_variance_halfpixvar16x16_v_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0; + unsigned int xxsum0; + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_variance_halfpixvar16x16_hv_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0; + unsigned int xxsum0; + + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); +} |