diff options
Diffstat (limited to 'third_party/aom/aom_dsp/arm')
-rw-r--r-- | third_party/aom/aom_dsp/arm/avg_neon.c | 38 | ||||
-rw-r--r-- | third_party/aom/aom_dsp/arm/bilinear_filter_media.asm | 240 | ||||
-rw-r--r-- | third_party/aom/aom_dsp/arm/sad_media.asm | 98 | ||||
-rw-r--r-- | third_party/aom/aom_dsp/arm/subpel_variance_media.c | 81 | ||||
-rw-r--r-- | third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm | 185 | ||||
-rw-r--r-- | third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm | 225 | ||||
-rw-r--r-- | third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm | 187 | ||||
-rw-r--r-- | third_party/aom/aom_dsp/arm/variance_media.asm | 361 |
8 files changed, 0 insertions, 1415 deletions
diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c index e730ccbcc..6ff760017 100644 --- a/third_party/aom/aom_dsp/arm/avg_neon.c +++ b/third_party/aom/aom_dsp/arm/avg_neon.c @@ -25,44 +25,6 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { return vget_lane_u32(c, 0); } -unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) { - uint16x8_t v_sum; - uint32x2_t v_s0 = vdup_n_u32(0); - uint32x2_t v_s1 = vdup_n_u32(0); - v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); - v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); - v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; -} - -unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); - - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - return (horizontal_add_u16x8(v_sum) + 32) >> 6; -} - // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. int aom_satd_neon(const int16_t *coeff, int length) { diff --git a/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm b/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm deleted file mode 100644 index 17b7d25f9..000000000 --- a/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm +++ /dev/null @@ -1,240 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_filter_block2d_bil_first_pass_media| - EXPORT |aom_filter_block2d_bil_second_pass_media| - - AREA |.text|, CODE, READONLY ; name this block of code - -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 unsigned short *dst_ptr, -; r2 unsigned int src_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *aom_filter -;------------------------------------- -; The output is transposed stroed in output array to make it easy for second pass filtering. -|aom_filter_block2d_bil_first_pass_media| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; aom_filter address - ldr r4, [sp, #36] ; width - - mov r12, r3 ; outer-loop counter - - add r7, r2, r4 ; preload next row - pld [r0, r7] - - sub r2, r2, r4 ; src increment for height loop - - ldr r5, [r11] ; load up filter coefficients - - mov r3, r3, lsl #1 ; height*2 - add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) - - mov r11, r1 ; save dst_ptr for each row - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_1st_filter - -|bil_height_loop_1st_v6| - ldrb r6, [r0] ; load source data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - mov lr, r4, lsr #2 ; 4-in-parellel loop counter - -|bil_width_loop_1st_v6| - ldrb r9, [r0, #3] - ldrb r10, [r0, #4] - - pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] - pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] - - smuad r6, r6, r5 ; apply the filter - pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] - smuad r7, r7, r5 - pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] - - smuad r8, r8, r5 - smuad r9, r9, r5 - - add r0, r0, #4 - subs lr, lr, #1 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #16, r6, asr #7 - usat r7, #16, r7, asr #7 - - strh r6, [r1], r3 ; result is transposed and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strh r7, [r1], r3 - add r9, r9, #0x40 - usat r8, #16, r8, asr #7 - usat r9, #16, r9, asr #7 - - strh r8, [r1], r3 ; result is transposed and stored - - ldrneb r6, [r0] ; load source data - strh r9, [r1], r3 - - ldrneb r7, [r0, #1] - ldrneb r8, [r0, #2] - - bne bil_width_loop_1st_v6 - - add r0, r0, r2 ; move to next input row - subs r12, r12, #1 - - add r9, r2, r4, lsl #1 ; adding back block width - pld [r0, r9] ; preload next row - - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_1st_v6 - - ldmia sp!, {r4 - r11, pc} - -|bil_null_1st_filter| -|bil_height_loop_null_1st| - mov lr, r4, lsr #2 ; loop counter - -|bil_width_loop_null_1st| - ldrb r6, [r0] ; load data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - ldrb r9, [r0, #3] - - strh r6, [r1], r3 ; store it to immediate buffer - add r0, r0, #4 - strh r7, [r1], r3 - subs lr, lr, #1 - strh r8, [r1], r3 - strh r9, [r1], r3 - - bne bil_width_loop_null_1st - - subs r12, r12, #1 - add r0, r0, r2 ; move to next input line - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_null_1st - - ldmia sp!, {r4 - r11, pc} - - ENDP ; |aom_filter_block2d_bil_first_pass_media| - - -;--------------------------------- -; r0 unsigned short *src_ptr, -; r1 unsigned char *dst_ptr, -; r2 int dst_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *aom_filter -;--------------------------------- -|aom_filter_block2d_bil_second_pass_media| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; aom_filter address - ldr r4, [sp, #36] ; width - - ldr r5, [r11] ; load up filter coefficients - mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix - mov r11, r1 - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_2nd_filter - -|bil_height_loop_2nd| - ldr r6, [r0] ; load the data - ldr r8, [r0, #4] - ldrh r10, [r0, #8] - mov lr, r3, lsr #2 ; loop counter - -|bil_width_loop_2nd| - pkhtb r7, r6, r8 ; src[1] | src[2] - pkhtb r9, r8, r10 ; src[3] | src[4] - - smuad r6, r6, r5 ; apply filter - smuad r8, r8, r5 ; apply filter - - subs lr, lr, #1 - - smuadx r7, r7, r5 ; apply filter - smuadx r9, r9, r5 ; apply filter - - add r0, r0, #8 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #8, r6, asr #7 - usat r7, #8, r7, asr #7 - strb r6, [r1], r2 ; the result is transposed back and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strb r7, [r1], r2 - add r9, r9, #0x40 - usat r8, #8, r8, asr #7 - usat r9, #8, r9, asr #7 - strb r8, [r1], r2 ; the result is transposed back and stored - - ldrne r6, [r0] ; load data - strb r9, [r1], r2 - ldrne r8, [r0, #4] - ldrneh r10, [r0, #8] - - bne bil_width_loop_2nd - - subs r12, r12, #1 - add r0, r0, #4 ; update src for next row - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_2nd - ldmia sp!, {r4 - r11, pc} - -|bil_null_2nd_filter| -|bil_height_loop_null_2nd| - mov lr, r3, lsr #2 - -|bil_width_loop_null_2nd| - ldr r6, [r0], #4 ; load data - subs lr, lr, #1 - ldr r8, [r0], #4 - - strb r6, [r1], r2 ; store data - mov r7, r6, lsr #16 - strb r7, [r1], r2 - mov r9, r8, lsr #16 - strb r8, [r1], r2 - strb r9, [r1], r2 - - bne bil_width_loop_null_2nd - - subs r12, r12, #1 - add r0, r0, #4 - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_null_2nd - - ldmia sp!, {r4 - r11, pc} - ENDP ; |aom_filter_block2d_second_pass_media| - - END diff --git a/third_party/aom/aom_dsp/arm/sad_media.asm b/third_party/aom/aom_dsp/arm/sad_media.asm deleted file mode 100644 index 49ddb6764..000000000 --- a/third_party/aom/aom_dsp/arm/sad_media.asm +++ /dev/null @@ -1,98 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_sad16x16_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 const unsigned char *src_ptr -; r1 int src_stride -; r2 const unsigned char *ref_ptr -; r3 int ref_stride -|aom_sad16x16_media| PROC - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - mov r4, #0 ; sad = 0; - mov r5, #8 ; loop count - -loop - ; 1st row - ldr r6, [r0, #0x0] ; load 4 src pixels (1A) - ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) - ldr r7, [r0, #0x4] ; load 4 src pixels (1A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) - ldr r10, [r0, #0x8] ; load 4 src pixels (1B) - ldr r11, [r0, #0xC] ; load 4 src pixels (1B) - - usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - ldr r6, [r0, #0x0] ; load 4 src pixels (2A) - ldr r7, [r0, #0x4] ; load 4 src pixels (2A) - add r4, r4, r8 ; add partial sad values - - ; 2nd row - ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) - ldr r10, [r0, #0x8] ; load 4 src pixels (2B) - ldr r11, [r0, #0xC] ; load 4 src pixels (2B) - - usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - subs r5, r5, #1 ; decrement loop counter - add r4, r4, r8 ; add partial sad values - - bne loop - - mov r0, r4 ; return sad - ldmfd sp!, {r4-r12, pc} - - ENDP - - END - diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_media.c b/third_party/aom/aom_dsp/arm/subpel_variance_media.c deleted file mode 100644 index 46ec028d3..000000000 --- a/third_party/aom/aom_dsp/arm/subpel_variance_media.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -#if HAVE_MEDIA -static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 }, - { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, - { 32, 96 }, { 16, 112 } }; - -extern void aom_filter_block2d_bil_first_pass_media( - const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch, - uint32_t height, uint32_t width, const int16_t *filter); - -extern void aom_filter_block2d_bil_second_pass_media( - const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch, - uint32_t height, uint32_t width, const int16_t *filter); - -unsigned int aom_sub_pixel_variance8x8_media( - const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t first_pass[10 * 8]; - uint8_t second_pass[8 * 8]; - const int16_t *HFilter, *VFilter; - - HFilter = bilinear_filters_media[xoffset]; - VFilter = bilinear_filters_media[yoffset]; - - aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass, - src_pixels_per_line, 9, 8, HFilter); - aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, - VFilter); - - return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, - sse); -} - -unsigned int aom_sub_pixel_variance16x16_media( - const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t first_pass[36 * 16]; - uint8_t second_pass[20 * 16]; - const int16_t *HFilter, *VFilter; - unsigned int var; - - if (xoffset == 4 && yoffset == 0) { - var = aom_variance_halfpixvar16x16_h_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 0 && yoffset == 4) { - var = aom_variance_halfpixvar16x16_v_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 4 && yoffset == 4) { - var = aom_variance_halfpixvar16x16_hv_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else { - HFilter = bilinear_filters_media[xoffset]; - VFilter = bilinear_filters_media[yoffset]; - - aom_filter_block2d_bil_first_pass_media( - src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); - aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, - 16, VFilter); - - var = aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, - sse); - } - return var; -} -#endif // HAVE_MEDIA diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm deleted file mode 100644 index 1e5c9178e..000000000 --- a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm +++ /dev/null @@ -1,185 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance_halfpixvar16x16_h_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance_halfpixvar16x16_h_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END - diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm deleted file mode 100644 index 9e0af830e..000000000 --- a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm +++ /dev/null @@ -1,225 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance_halfpixvar16x16_hv_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance_halfpixvar16x16_hv_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; pointer to pixels on the next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load source pixels a, row N - ldr r6, [r0, #1] ; load source pixels b, row N - ldr r5, [r9, #0] ; load source pixels c, row N+1 - ldr r7, [r9, #1] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #0] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load source pixels a, row N - ldr r6, [r0, #5] ; load source pixels b, row N - ldr r5, [r9, #4] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #5] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #4] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load source pixels a, row N - ldr r6, [r0, #9] ; load source pixels b, row N - ldr r5, [r9, #8] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #9] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #8] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load source pixels a, row N - ldr r6, [r0, #13] ; load source pixels b, row N - ldr r5, [r9, #12] ; load source pixels c, row N+1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - ldr r7, [r9, #13] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #12] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm deleted file mode 100644 index 545b68179..000000000 --- a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm +++ /dev/null @@ -1,187 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance_halfpixvar16x16_v_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance_halfpixvar16x16_v_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; set src pointer to next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r9, #0] ; load 4 src pixels from next row - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r9, #4] ; load 4 src pixels from next row - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r9, #8] ; load 4 src pixels from next row - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r9, #12] ; load 4 src pixels from next row - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END - diff --git a/third_party/aom/aom_dsp/arm/variance_media.asm b/third_party/aom/aom_dsp/arm/variance_media.asm deleted file mode 100644 index fdc311a81..000000000 --- a/third_party/aom/aom_dsp/arm/variance_media.asm +++ /dev/null @@ -1,361 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance16x16_media| - EXPORT |aom_variance8x8_media| - EXPORT |aom_mse16x16_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance16x16_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop16x16 - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop16x16 - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance8x8_media| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop8x8 - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop8x8 - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on aom_variance16x16_media. In this function, sum is never used. -; So, we can remove this part of calculation. - -|aom_mse16x16_media| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loopmse - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loopmse - - ; return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END |