Diffstat (limited to 'media/libvpx/vpx_dsp')
25 files changed, 6714 insertions, 0 deletions
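Every SAD ("sum of absolute differences") routine added by this patch computes the same quantity as the scalar reference in vpx_dsp/sad.c further down in the diff; the NEON, ARMv6 media and SSE2 files only vectorize it. As orientation, a minimal scalar sketch of that quantity (sad_ref is an illustrative name, not a function from the patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative reference: sum of |a - b| over a width x height block,
     * equivalent to the sad() helper added in vpx_dsp/sad.c below. */
    static unsigned int sad_ref(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                int width, int height) {
      unsigned int sad = 0;
      int y, x;
      for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
          sad += abs(a[x] - b[x]);
        a += a_stride;
        b += b_stride;
      }
      return sad;
    }

The x4d variants below take four reference pointers (ref[4]) and write four results (res[4]), scoring the same source block against four candidate reference blocks in one call.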
diff --git a/media/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/vpx_dsp/arm/sad4d_neon.c new file mode 100644 index 000000000..c7704dc1b --- /dev/null +++ b/media/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), + vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), + vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, +// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo +// and vec_sum_ref_hi. +static void sad_neon_64(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8x16_t vec_src_32, + const uint8x16_t vec_src_48, + const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); +} + +// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, +// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. 
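A note on the accumulation pattern shared by sad_neon_64 above and sad_neon_32 below: vabal_u8 is a widening absolute-difference-and-accumulate, so each 16-bit lane of vec_sum_ref_lo/vec_sum_ref_hi collects running byte differences, and horizontal_long_add_16x8 only folds the 16 lanes into one 32-bit total at the end. A scalar sketch of that final reduction (the _ref name is illustrative, not part of the patch):

    #include <stdint.h>

    /* Illustrative scalar equivalent of horizontal_long_add_16x8():
     * add all 16 lanes of the two 16-bit accumulators into one
     * 32-bit sum. Lane order does not affect the result. */
    static unsigned int horizontal_long_add_16x8_ref(const uint16_t lo[8],
                                                     const uint16_t hi[8]) {
      unsigned int total = 0;
      int i;
      for (i = 0; i < 8; i++)
        total += (unsigned int)lo[i] + (unsigned int)hi[i];
      return total;
    }

Deferring the widening is safe for the block sizes handled here: in the 64x64 case each 16-bit lane receives at most 4 byte differences per row over 64 rows, i.e. at most 256 * 255 = 65280, which still fits in a uint16_t.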
+static void sad_neon_32(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); +} + +void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t* const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t* const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + + sad_neon_32(vec_src_00, vec_src_16, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + 
ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t* const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref0 = vld1q_u8(ref0); + const uint8x16_t vec_ref1 = vld1q_u8(ref1); + const uint8x16_t vec_ref2 = vld1q_u8(ref2); + const uint8x16_t vec_ref3 = vld1q_u8(ref3); + + vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref0)); + vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref0)); + vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref1)); + vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref1)); + vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref2)); + vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref2)); + vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref3)); + vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref3)); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} diff --git a/media/libvpx/vpx_dsp/arm/sad_media.asm b/media/libvpx/vpx_dsp/arm/sad_media.asm new file mode 100644 index 000000000..aed1d3a22 --- /dev/null +++ b/media/libvpx/vpx_dsp/arm/sad_media.asm @@ -0,0 +1,95 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vpx_sad16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 const unsigned char *src_ptr +; r1 int src_stride +; r2 const unsigned char *ref_ptr +; r3 int ref_stride +|vpx_sad16x16_media| PROC + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + mov r4, #0 ; sad = 0; + mov r5, #8 ; loop count + +loop + ; 1st row + ldr r6, [r0, #0x0] ; load 4 src pixels (1A) + ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) + ldr r7, [r0, #0x4] ; load 4 src pixels (1A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) + ldr r10, [r0, #0x8] ; load 4 src pixels (1B) + ldr r11, [r0, #0xC] ; load 4 src pixels (1B) + + usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + ldr r6, [r0, #0x0] ; load 4 src pixels (2A) + ldr r7, [r0, #0x4] ; load 4 src pixels (2A) + add r4, r4, r8 ; add partial sad values + + ; 2nd row + ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) + ldr r10, [r0, #0x8] ; load 4 src pixels (2B) + ldr r11, [r0, #0xC] ; load 4 src pixels (2B) + + usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + subs r5, r5, #1 ; decrement loop counter + add r4, r4, r8 ; add partial sad values + + bne loop + + mov r0, r4 ; return sad + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git a/media/libvpx/vpx_dsp/arm/sad_neon.c b/media/libvpx/vpx_dsp/arm/sad_neon.c new file mode 100644 index 000000000..173f08ac3 --- /dev/null +++ b/media/libvpx/vpx_dsp/arm/sad_neon.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +unsigned int vpx_sad8x16_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vpx_sad4x4_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int vpx_sad16x8_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), + vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), + vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const 
uint8x16_t vec_src_48 = vld1q_u8(src + 48); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); + } + return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref = vld1q_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); +} diff --git a/media/libvpx/vpx_dsp/arm/variance_media.asm b/media/libvpx/vpx_dsp/arm/variance_media.asm new file mode 100644 index 000000000..f7f9e14b0 --- /dev/null +++ b/media/libvpx/vpx_dsp/arm/variance_media.asm @@ -0,0 +1,358 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vpx_variance16x16_media| + EXPORT |vpx_variance8x8_media| + EXPORT |vpx_mse16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance16x16_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop16x16 + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, 
#12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop16x16 + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance8x8_media| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop8x8 + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad 
r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne loop8x8 + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +; +;note: Based on vpx_variance16x16_media. In this function, sum is never used. +; So, we can remove this part of calculation. + +|vpx_mse16x16_media| PROC + + push {r4-r9, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #16 ; set loop counter to 16 (=block height) + mov r4, #0 ; initialize sse = 0 + +loopmse + ; 1st 4 pixels + ldr r5, [r0, #0x0] ; load 4 src pixels + ldr r6, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r5, r6 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0x4] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r2, #0x4] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + ldr r5, [r0, #0x8] ; load 4 src pixels + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r6, [r2, #0x8] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0xc] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r6, [r2, #0xc] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; 
calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + subs r12, r12, #1 ; next row + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + bne loopmse + + ; return stuff + ldr r1, [sp, #28] ; get address of sse + mov r0, r4 ; return sse + str r4, [r1] ; store sse + + pop {r4-r9, pc} + + ENDP + + END diff --git a/media/libvpx/vpx_dsp/arm/variance_neon.c b/media/libvpx/vpx_dsp/arm/variance_neon.c new file mode 100644 index 000000000..ede6e7bbb --- /dev/null +++ b/media/libvpx/vpx_dsp/arm/variance_neon.c @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +// w * h must be less than 2048 or local variable v_sum may overflow. 
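variance_neon_w8, defined next, mirrors the scalar helper added in vpx_dsp/variance.c later in this diff: it returns the sum of the signed pixel differences and the sum of their squares, and the per-size wrappers then form variance as sse - sum * sum / (w * h). A scalar sketch of that core (variance_ref is an illustrative name, not a function from the patch):

    #include <stdint.h>

    /* Illustrative scalar equivalent of the variance core: accumulate the
     * signed difference (sum) and its square (sse); the caller computes
     * sse - (sum * sum) / (w * h). */
    static void variance_ref(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, unsigned int *sse, int *sum) {
      int i, j;
      *sse = 0;
      *sum = 0;
      for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
          const int diff = a[j] - b[j];
          *sum += diff;
          *sse += diff * diff;
        }
        a += a_stride;
        b += b_stride;
      }
    }

The right shifts in the NEON wrappers (>> 6, >> 8, >> 10, >> 11, >> 12) are that division written as a shift, since every block size handled here is a power of two; the 64-wide and 64-tall blocks are split into multiple calls (64x16 or 32x32 slices) so each call keeps w * h at 1024, inside the 2048 bound noted above.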
+static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, uint32_t *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vpx_get8x8var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void vpx_get16x16var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 +} + +unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 +} + +unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 +} + +unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, + b + (32 * b_stride), b_stride, 32, 32, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, + b + (16 * 2 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * 
a_stride), a_stride, + b + (16 * 3 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 +} + +unsigned int vpx_variance16x8_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_variance8x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + 
d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_mse16x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int vpx_get4x4sse_cs_neon( + const unsigned char *src_ptr, + int source_stride, + 
const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} diff --git a/media/libvpx/vpx_dsp/sad.c b/media/libvpx/vpx_dsp/sad.c new file mode 100644 index 000000000..c0c3ff996 --- /dev/null +++ b/media/libvpx/vpx_dsp/sad.c @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +/* Sum the difference between every corresponding element of the buffers. */ +static INLINE unsigned int sad(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up. +/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred. + * The function averages every corresponding element of the buffers and stores + * the value in a third buffer, comp_pred. + * pred and comp_pred are assumed to have stride = width + * In the usage below comp_pred is a local array. 
+ */ +static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#define sadMxN(m, n) \ +unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ +} \ +unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ +} + +// depending on call sites, pass **ref_array to avoid & in subsequent call and +// de-dup with 4D below. +#define sadMxNxK(m, n, k) \ +void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ +} + +// This appears to be equivalent to the above when k == 4 and refs is const +#define sadMxNx4D(m, n) \ +void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ +} + +// 64x64 +sadMxN(64, 64) +sadMxNxK(64, 64, 3) +sadMxNxK(64, 64, 8) +sadMxNx4D(64, 64) + +// 64x32 +sadMxN(64, 32) +sadMxNx4D(64, 32) + +// 32x64 +sadMxN(32, 64) +sadMxNx4D(32, 64) + +// 32x32 +sadMxN(32, 32) +sadMxNxK(32, 32, 3) +sadMxNxK(32, 32, 8) +sadMxNx4D(32, 32) + +// 32x16 +sadMxN(32, 16) +sadMxNx4D(32, 16) + +// 16x32 +sadMxN(16, 32) +sadMxNx4D(16, 32) + +// 16x16 +sadMxN(16, 16) +sadMxNxK(16, 16, 3) +sadMxNxK(16, 16, 8) +sadMxNx4D(16, 16) + +// 16x8 +sadMxN(16, 8) +sadMxNxK(16, 8, 3) +sadMxNxK(16, 8, 8) +sadMxNx4D(16, 8) + +// 8x16 +sadMxN(8, 16) +sadMxNxK(8, 16, 3) +sadMxNxK(8, 16, 8) +sadMxNx4D(8, 16) + +// 8x8 +sadMxN(8, 8) +sadMxNxK(8, 8, 3) +sadMxNxK(8, 8, 8) +sadMxNx4D(8, 8) + +// 8x4 +sadMxN(8, 4) +sadMxNxK(8, 4, 8) +sadMxNx4D(8, 4) + +// 4x8 +sadMxN(4, 8) +sadMxNxK(4, 8, 8) +sadMxNx4D(4, 8) + +// 4x4 +sadMxN(4, 4) +sadMxNxK(4, 4, 3) +sadMxNxK(4, 4, 8) +sadMxNx4D(4, 4) + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +static INLINE 
unsigned int highbd_sadb(const uint8_t *a8, int a_stride, + const uint16_t *b, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define highbd_sadMxN(m, n) \ +unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ +} \ +unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ +} + +#define highbd_sadMxNxK(m, n, k) \ +void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \ + ref_stride); \ + } \ +} + +#define highbd_sadMxNx4D(m, n) \ +void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \ + ref_stride); \ + } \ +} + +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxNxK(64, 64, 3) +highbd_sadMxNxK(64, 64, 8) +highbd_sadMxNx4D(64, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxNx4D(64, 32) + +// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxNx4D(32, 64) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxNxK(32, 32, 3) +highbd_sadMxNxK(32, 32, 8) +highbd_sadMxNx4D(32, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxNx4D(32, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxNx4D(16, 32) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxNxK(16, 16, 3) +highbd_sadMxNxK(16, 16, 8) +highbd_sadMxNx4D(16, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxNxK(16, 8, 3) +highbd_sadMxNxK(16, 8, 8) +highbd_sadMxNx4D(16, 8) + +// 8x16 +highbd_sadMxN(8, 16) +highbd_sadMxNxK(8, 16, 3) +highbd_sadMxNxK(8, 16, 8) +highbd_sadMxNx4D(8, 16) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxNxK(8, 8, 3) +highbd_sadMxNxK(8, 8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxNxK(8, 4, 8) +highbd_sadMxNx4D(8, 4) + +// 4x8 +highbd_sadMxN(4, 8) +highbd_sadMxNxK(4, 8, 8) +highbd_sadMxNx4D(4, 8) + +// 4x4 +highbd_sadMxN(4, 4) +highbd_sadMxNxK(4, 4, 3) +highbd_sadMxNxK(4, 4, 8) +highbd_sadMxNx4D(4, 4) + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/vpx_dsp/variance.c b/media/libvpx/vpx_dsp/variance.c new file mode 100644 index 000000000..084dd7b7e --- /dev/null +++ b/media/libvpx/vpx_dsp/variance.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride) { + int distortion = 0; + int r, c; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + int diff = a[c] - b[c]; + distortion += diff * diff; + } + + a += a_stride; + b += b_stride; + } + + return distortion; +} + +unsigned int vpx_get_mb_ss_c(const int16_t *a) { + unsigned int i, sum = 0; + + for (i = 0; i < 256; ++i) { + sum += a[i] * a[i]; + } + + return sum; +} + +static void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#define VAR(W, H) \ +unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ +void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse, int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +} + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * variable. 
+ */ +#define MSE(W, H) \ +unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ +} + +VAR(64, 64) +VAR(64, 32) +VAR(32, 64) +VAR(32, 32) +VAR(32, 16) +VAR(16, 32) +VAR(16, 16) +VAR(16, 8) +VAR(8, 16) +VAR(8, 8) +VAR(8, 4) +VAR(4, 8) +VAR(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ +unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +#define HIGHBD_GET_VAR(S) \ +void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, 
ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} + +#define HIGHBD_MSE(W, H) \ +unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +HIGHBD_VAR(64, 64) +HIGHBD_VAR(64, 32) +HIGHBD_VAR(32, 64) +HIGHBD_VAR(32, 32) +HIGHBD_VAR(32, 16) +HIGHBD_VAR(16, 32) +HIGHBD_VAR(16, 16) +HIGHBD_VAR(16, 8) +HIGHBD_VAR(8, 16) +HIGHBD_VAR(8, 8) +HIGHBD_VAR(8, 4) +HIGHBD_VAR(4, 8) +HIGHBD_VAR(4, 4) + +void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/vpx_dsp/vpx_dsp_rtcd.c new file mode 100644 index 000000000..d2476039d --- /dev/null +++ b/media/libvpx/vpx_dsp/vpx_dsp_rtcd.c @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/vpx_once.h" +#ifdef _MSC_VER +#include <intrin.h> +#endif + +void vpx_dsp_rtcd() { + once(setup_rtcd_internal); +} diff --git a/media/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/media/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 000000000..95cc4372e --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,289 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
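The highbd_10_variance and highbd_12_variance helpers above rescale their statistics back to an 8-bit domain before returning: a 10-bit sample difference is 4x its 8-bit counterpart, so the squared term shrinks by 16 (ROUND_POWER_OF_TWO(sse_long, 4)) and the linear term by 4 (ROUND_POWER_OF_TWO(sum_long, 2)); for 12-bit the shifts become 8 and 4. A minimal sketch of that scaling, with hypothetical names:

#include <assert.h>

static void check_10bit_scaling(void) {
  const int diff8 = 3;             /* a difference in 8-bit units         */
  const int diff10 = diff8 << 2;   /* the same difference in 10-bit units */
  assert(diff10 * diff10 == (diff8 * diff8) << 4);  /* sse scales by 2^4  */
  assert(diff10 == (diff8 << 2));                   /* sum scales by 2^2  */
}

The 64-bit intermediates (sse_long, sum_long) exist because at 12 bits the unscaled squared sum of a 64x64 block overflows 32 bits.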
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 
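The HIGH_PROCESS_*x2x4 macros have no 16-bit absolute-difference instruction to lean on, so they build one: psubusw saturates to zero in whichever direction underflows, the two results are OR-ed together, and pmaddwd against m1 (a register of 16-bit ones set up by HIGH_SADNXN4D below) widens and pairwise-sums the differences into 32-bit accumulators. A scalar sketch of the saturating-subtract identity (illustrative only):

#include <stdint.h>

/* Scalar analogue of psubusw: unsigned subtract with saturation at zero. */
static uint16_t sat_sub_u16(uint16_t a, uint16_t b) {
  return (a > b) ? (uint16_t)(a - b) : 0;
}

/* |a - b| for unsigned 16-bit values: at most one of the two saturating
 * subtractions is non-zero, so OR-ing them recovers the absolute difference. */
static uint16_t abs_diff_u16(uint16_t a, uint16_t b) {
  return (uint16_t)(sat_sub_u16(a, b) | sat_sub_u16(b, a));
}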
+%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro HIGH_SADNXN4D 2 +%if UNIX64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 diff --git a/media/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/media/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 000000000..4d422dde3 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,365 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
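HIGH_SADNXN4D stamps out the vpx_highbd_sadNxNx4d_sse2 entry points, which score one source block against four candidate references in a single pass and store the four totals through res. A plain-C reference of that contract (a sketch under the stated prototype, not the shipped implementation; working on uint16_t pointers mirrors the shl-by-1 conversion in the prologue):

#include <stdint.h>
#include <stdlib.h>

static void highbd_sad_x4d_ref(const uint16_t *src, int src_stride,
                               const uint16_t *const ref[4], int ref_stride,
                               int w, int h, uint32_t res[4]) {
  int k, i, j;
  for (k = 0; k < 4; ++k) {
    const uint16_t *s = src;
    const uint16_t *r = ref[k];
    uint32_t sad = 0;
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j)
        sad += (uint32_t)abs(s[j] - r[j]);
      s += src_stride;
      r += ref_stride;
    }
    res[k] = sad;
  }
}

The point of the x4d variants is to share the source loads across all four references, since the motion search typically evaluates several candidate vectors per position.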
+; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro HIGH_SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 + + +; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu 
m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 + +; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 + + +; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m2 + psubusw m2, [srcq+src_strideq*2] + por m2, m5 + mova m5, [srcq+src_strideq*4] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*4] + por m3, m5 + mova 
m5, [srcq+src_stride3q*2] + psubusw m5, m4 + psubusw m4, [srcq+src_stride3q*2] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 diff --git a/media/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/media/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 000000000..923418a99 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,313 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vpx_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vpx_highbd_calc16x16var_sse2) PRIVATE +sym(vpx_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + 
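The optional second argument to the HIGH_SAD*XN macros above generates the _avg_sse2 variants: each reference row is first averaged with second_pred via pavgw (which rounds up) and only then differenced against the source, as used when searching with a second (compound) predictor. A minimal scalar sketch of one row (illustrative names):

#include <stdint.h>
#include <stdlib.h>

static uint32_t highbd_avg_sad_row_ref(const uint16_t *src,
                                       const uint16_t *ref,
                                       const uint16_t *second_pred, int w) {
  uint32_t sad = 0;
  int j;
  for (j = 0; j < w; ++j) {
    const uint16_t avg = (uint16_t)((ref[j] + second_pred[j] + 1) >> 1);  /* pavgw */
    sad += (uint32_t)abs(src[j] - avg);
  }
  return sad;
}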
movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vpx_highbd_calc8x8var_sse2) PRIVATE +sym(vpx_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS 
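Both highbd_calc*var_sse2 kernels keep their per-iteration diff sums as signed 16-bit lanes in xmm5 and sign-extend them before folding into the 32-bit accumulator: pcmpgtw/pcmpeqw/por computes an "x >= 0" mask, the extra pcmpeqw against zero flips it into an all-ones mask for the negative lanes, and punpcklwd/punpckhwd interleave value and mask, which is exactly sign extension. A scalar sketch of the equivalence (illustrative only, two's complement assumed):

#include <stdint.h>

/* Widening a 16-bit lane by pairing it with 0xffff (negative) or 0x0000
 * (non-negative) in the high half reproduces ordinary sign extension. */
static int32_t sign_extend_via_mask(int16_t x) {
  const uint32_t mask = (x < 0) ? 0xffffu : 0x0000u;
  return (int32_t)((mask << 16) | (uint16_t)x);  /* equals (int32_t)x */
}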
+ pop rbp + ret diff --git a/media/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/media/libvpx/vpx_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 000000000..343c0478b --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_variance.h" +#include "vpx_ports/mem.h" + +typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = ROUND_POWER_OF_TWO(sse_long, 8); +} + + +#define HIGH_GET_VAR(S) \ +void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + 
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ +} + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ +uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} + +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); + +#undef VAR_FN + +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned 
int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} diff --git a/media/libvpx/vpx_dsp/x86/sad4d_avx2.c b/media/libvpx/vpx_dsp/x86/sad4d_avx2.c new file mode 100644 index 000000000..793658f9e --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_sad32x32x4d_avx2(const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 32 ; i++) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
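Back in highbd_variance_sse2.c, VAR_FN passes the log2 of the block area as its last argument, so the sum*sum/(W*H) correction from the C reference becomes a right shift: 64x64 uses shift 12, 16x8 uses shift 7, and so on; because sum*sum is never negative, the shift and the division agree. A tiny check (illustrative):

#include <assert.h>
#include <stdint.h>

static void check_var_shift(void) {
  const int64_t sum = -12345;                             /* any block diff sum */
  assert((sum * sum) / (16 * 8) == ((sum * sum) >> 7));   /* 128 == 1 << 7      */
}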
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} + +void vpx_sad64x64x4d_avx2(const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, + uint32_t res[4]) { + __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; + __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; + __m256i ref3_reg, ref3next_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 64 ; i++) { + // load 64 bytes from src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); + ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); + ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); + ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
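Each _mm256_sad_epu8 in the x4d AVX2 kernels yields four 64-bit lanes, one per 8-byte group, each holding that group's sum of absolute differences, so after the row loop every per-reference accumulator carries four partial SADs whose low 32 bits are the only non-zero part. The shift/or/unpack sequence above then interleaves the four accumulators so a single 128-bit store writes res[0..3]; the scalar meaning per reference is just a lane sum (sketch, illustrative):

#include <stdint.h>

/* What the final shuffle computes for one reference: the sum of the four
 * 64-bit partial SADs held in that reference's accumulator. */
static uint32_t reduce_sad_lanes(const uint64_t lanes[4]) {
  return (uint32_t)(lanes[0] + lanes[1] + lanes[2] + lanes[3]);
}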
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} diff --git a/media/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/media/libvpx/vpx_dsp/x86/sad4d_sse2.asm new file mode 100644 index 000000000..0f7fb93d4 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad4d_sse2.asm @@ -0,0 +1,233 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m6, [ref1q+%5] + punpckldq m4, [ref2q+%5] + punpckldq m7, [ref3q+%5] + punpckldq m5, [ref4q+%5] + psadbw m6, m0 + psadbw m4, m0 + psadbw m7, m0 + psadbw m5, m0 + punpckldq m6, m4 + punpckldq m7, m5 +%else + movd m1, [ref1q+%3] + movd m2, [ref2q+%3] + movd m3, [ref3q+%3] + movd m4, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m1, [ref1q+%5] + punpckldq m2, [ref2q+%5] + punpckldq m3, [ref3q+%5] + punpckldq m4, [ref4q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + psadbw m4, m0 + punpckldq m1, m2 + punpckldq m3, m4 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro 
PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro SADNXN4D 2 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if mmsize == 16 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 + +INIT_MMX sse +SADNXN4D 4, 8 +SADNXN4D 4, 4 diff --git a/media/libvpx/vpx_dsp/x86/sad_avx2.c b/media/libvpx/vpx_dsp/x86/sad_avx2.c new file mode 100644 index 000000000..ce9ad8f78 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_avx2.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
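SADNXN4D unrolls by row pairs: one leading PROCESS_%1x2x4 call, (%2-4)/2 repeated calls, and one trailing call that skips the pointer advance, which covers exactly %2 rows for every block height instantiated here. A small check of that bookkeeping (illustrative):

#include <assert.h>

static void check_sad4d_unroll(void) {
  int n;
  for (n = 4; n <= 64; n *= 2)  /* the block heights used: 4, 8, 16, 32, 64 */
    assert(2 * (1 + (n - 4) / 2 + 1) == n);
}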
+ */ +#include <immintrin.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +#define FSAD64_H(h) \ +unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0 ; i < h ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref_stride; \ + src_ptr+= src_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define FSAD32_H(h) \ +unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0 ; i < max ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref2_stride; \ + src_ptr+= src2_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define FSAD64 \ +FSAD64_H(64); \ +FSAD64_H(32); + +#define FSAD32 \ +FSAD32_H(64); \ +FSAD32_H(32); \ +FSAD32_H(16); + +FSAD64; +FSAD32; + +#undef FSAD64 +#undef FSAD32 +#undef FSAD64_H +#undef FSAD32_H + +#define FSADAVG64_H(h) \ +unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0 ; i < h ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = _mm256_add_epi32(sum_sad, 
_mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref_stride; \ + src_ptr+= src_stride; \ + second_pred+= 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define FSADAVG32_H(h) \ +unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0 ; i < max ; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ + sad1_reg = _mm256_sad_epu8(ref1_reg, \ + _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8(ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = _mm256_add_epi32(sum_sad, \ + _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr+= ref2_stride; \ + src_ptr+= src2_stride; \ + second_pred+= 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ +} + +#define FSADAVG64 \ +FSADAVG64_H(64); \ +FSADAVG64_H(32); + +#define FSADAVG32 \ +FSADAVG32_H(64); \ +FSADAVG32_H(32); \ +FSADAVG32_H(16); + +FSADAVG64; +FSADAVG32; + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/media/libvpx/vpx_dsp/x86/sad_mmx.asm b/media/libvpx/vpx_dsp/x86/sad_mmx.asm new file mode 100644 index 000000000..9968992bd --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_mmx.asm @@ -0,0 +1,427 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +global sym(vpx_sad16x16_mmx) PRIVATE +global sym(vpx_sad8x16_mmx) PRIVATE +global sym(vpx_sad8x8_mmx) PRIVATE +global sym(vpx_sad4x4_mmx) PRIVATE +global sym(vpx_sad16x8_mmx) PRIVATE + +;unsigned int vpx_sad16x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x16x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + + paddw mm0, mm2 + paddw mm1, mm3 + + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + paddw mm7, mm1 + + cmp rsi, rcx + jne .x16x16sad_mmx_loop + + + movq mm0, mm7 + + punpcklwd mm0, mm6 + punpckhwd mm7, mm6 + + paddw mm0, mm7 + movq mm7, mm0 + + + psrlq mm0, 32 + paddw mm7, mm0 + + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad8x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad8x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x8x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + paddw mm7, mm2 + cmp rsi, rcx + + jne .x8x16sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad8x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x8x8sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + paddw mm0, mm2 + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + cmp rsi, rcx + + jne .x8x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + 
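vpx_sad16x16_mmx accumulates into 16-bit word lanes (the paddw chain into mm7) and never widens inside the loop; that is safe because even the fully reduced 16x16 SAD is bounded by 16 * 16 * 255 = 65280, which still fits in an unsigned 16-bit lane, so neither the per-row adds nor the word-wide folding at the end can wrap. A worked check of the bound (illustrative):

#include <assert.h>

static void check_sad16x16_bound(void) {
  assert(16 * 16 * 255 == 65280);
  assert(16 * 16 * 255 < (1 << 16));
}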
+;unsigned int vpx_sad4x4_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, DWORD PTR [rsi] + movd mm1, DWORD PTR [rdi] + + movd mm2, DWORD PTR [rsi+rax] + movd mm3, DWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + pxor mm3, mm3 + + punpcklbw mm0, mm3 + punpckhbw mm2, mm3 + + paddw mm0, mm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movd mm4, DWORD PTR [rsi] + movd mm5, DWORD PTR [rdi] + + movd mm6, DWORD PTR [rsi+rax] + movd mm7, DWORD PTR [rdi+rdx] + + punpcklbw mm4, mm6 + punpcklbw mm5, mm7 + + movq mm6, mm4 + psubusb mm4, mm5 + + psubusb mm5, mm6 + por mm4, mm5 + + movq mm5, mm4 + punpcklbw mm4, mm3 + + punpckhbw mm5, mm3 + paddw mm4, mm5 + + paddw mm0, mm4 + movq mm1, mm0 + + punpcklwd mm0, mm3 + punpckhwd mm1, mm3 + + paddw mm0, mm1 + movq mm1, mm0 + + psrlq mm0, 32 + paddw mm0, mm1 + + movq rax, mm0 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_sad16x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vpx_sad16x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +.x16x8sad_mmx_loop: + + movq mm0, [rsi] + movq mm1, [rdi] + + movq mm2, [rsi+8] + movq mm3, [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpckhbw mm1, mm6 + + punpcklbw mm2, mm6 + punpckhbw mm3, mm6 + + + paddw mm0, mm2 + paddw mm1, mm3 + + paddw mm0, mm1 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + cmp rsi, rcx + jne .x16x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movq rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vpx_dsp/x86/sad_sse2.asm b/media/libvpx/vpx_dsp/x86/sad_sse2.asm new file mode 100644 index 000000000..c6a829dc2 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_sse2.asm @@ -0,0 +1,269 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%define program_name vpx + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 + +; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 + +; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea 
refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 + +; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 + +; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m3, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movd m2, [srcq] + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m6, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m6 + psadbw m1, m2 + psadbw m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movd eax, m0 + RET +%endmacro + +INIT_MMX sse +SAD4XN 8 ; sad4x8_sse +SAD4XN 4 ; sad4x4_sse +SAD4XN 8, 1 ; sad4x8_avg_sse +SAD4XN 4, 1 ; sad4x4_avg_sse diff --git a/media/libvpx/vpx_dsp/x86/sad_sse3.asm b/media/libvpx/vpx_dsp/x86/sad_sse3.asm new file mode 100644 index 000000000..18279bdb9 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_sse3.asm @@ -0,0 +1,374 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBVPX_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +%macro PROCESS_16X2X3 5 +%if %1==0 + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm5, XMMWORD PTR [%3] + lddqu xmm6, XMMWORD PTR [%3+1] + lddqu xmm7, XMMWORD PTR [%3+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm1, XMMWORD PTR [%3] + lddqu xmm2, XMMWORD PTR [%3+1] + lddqu xmm3, XMMWORD PTR [%3+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [%2+%4] + lddqu xmm1, XMMWORD PTR [%3+%5] + lddqu xmm2, XMMWORD PTR [%3+%5+1] + lddqu xmm3, XMMWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_8X2X3 5 +%if %1==0 + movq mm0, QWORD PTR [%2] + movq mm5, QWORD PTR [%3] + movq mm6, QWORD PTR [%3+1] + movq mm7, QWORD PTR [%3+2] + + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, QWORD PTR [%2] + movq mm1, QWORD PTR [%3] + movq mm2, QWORD PTR [%3+1] + movq mm3, QWORD PTR [%3+2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endif + movq mm0, QWORD PTR [%2+%4] + movq mm1, QWORD PTR [%3+%5] + movq mm2, QWORD PTR [%3+%5+1] + movq mm3, QWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endmacro + +;void int vpx_sad16x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad16x16x3_sse3) PRIVATE +sym(vpx_sad16x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, 
src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int vpx_sad16x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad16x8x3_sse3) PRIVATE +sym(vpx_sad16x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int vpx_sad8x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad8x16x3_sse3) PRIVATE +sym(vpx_sad8x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 + +;void int vpx_sad8x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad8x8x3_sse3) PRIVATE +sym(vpx_sad8x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 + +;void int vpx_sad4x4x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad4x4x3_sse3) PRIVATE +sym(vpx_sad4x4x3_sse3): + + STACK_FRAME_CREATE_X3 + + movd mm0, DWORD PTR [src_ptr] + movd mm1, DWORD PTR [ref_ptr] + + movd mm2, DWORD PTR [src_ptr+src_stride] + movd mm3, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, DWORD PTR [ref_ptr+1] + movd mm5, DWORD PTR [ref_ptr+2] + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm3, DWORD PTR [ref_ptr+ref_stride+2] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + psadbw mm4, mm0 + psadbw mm5, mm0 + + lea src_ptr, [src_ptr+src_stride*2] + lea ref_ptr, [ref_ptr+ref_stride*2] + + movd mm0, DWORD PTR [src_ptr] + movd mm2, DWORD PTR [ref_ptr] + + movd mm3, DWORD PTR [src_ptr+src_stride] + movd mm6, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm6 + + 
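        ; Rows 3 and 4: mm0 now holds the interleaved source rows and mm2 the
        ; interleaved reference rows at offset 0. The loads below fetch the +1
        ; and +2 byte offsets so their SADs can be folded into the running
        ; totals from rows 1 and 2 (mm1, mm4 and mm5) before the final store.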
movd mm3, DWORD PTR [ref_ptr+1] + movd mm7, DWORD PTR [ref_ptr+2] + + psadbw mm2, mm0 + + paddw mm1, mm2 + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm6, DWORD PTR [ref_ptr+ref_stride+2] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm6 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + paddw mm3, mm4 + paddw mm7, mm5 + + mov rcx, result_ptr + + punpckldq mm1, mm3 + + movq [rcx], mm1 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 diff --git a/media/libvpx/vpx_dsp/x86/sad_sse4.asm b/media/libvpx/vpx_dsp/x86/sad_sse4.asm new file mode 100644 index 000000000..bc6744797 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_sse4.asm @@ -0,0 +1,359 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X8 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm1, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm1, xmm2 + paddw xmm1, xmm3 + paddw xmm1, xmm4 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endif + movdqa xmm0, XMMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + movq xmm2, MMWORD PTR [rdi+ rdx+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_8X2X8 1 +%if %1 + movq xmm0, MMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm1, xmm2 +%else + movq xmm0, MMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endif + movq xmm0, MMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_4X2X8 1 +%if %1 + movd xmm0, [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + mpsadbw xmm1, xmm0, 0x0 +%else + movd xmm0, [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + mpsadbw 
xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endif + movd xmm0, [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endmacro + +%macro WRITE_AS_INTS 0 + mov rdi, arg(4) ;Results + pxor xmm0, xmm0 + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm0 + punpckhwd xmm2, xmm0 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm2 +%endmacro + +;void vpx_sad16x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array); +global sym(vpx_sad16x16x8_sse4_1) PRIVATE +sym(vpx_sad16x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad16x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vpx_sad16x8x8_sse4_1) PRIVATE +sym(vpx_sad16x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad8x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vpx_sad8x8x8_sse4_1) PRIVATE +sym(vpx_sad8x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad8x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vpx_sad8x16x8_sse4_1) PRIVATE +sym(vpx_sad8x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_sad4x4x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vpx_sad4x4x8_sse4_1) PRIVATE +sym(vpx_sad4x4x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + 
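    ; The stride loads below complete the prolog; two PROCESS_4X2X8
    ; invocations then cover all four rows, with mpsadbw producing eight
    ; 4-byte SADs (reference offsets 0..7) for each row. The sums
    ; accumulate in xmm1 and WRITE_AS_INTS stores them as eight 32-bit
    ; values through arg(4).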
movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + + diff --git a/media/libvpx/vpx_dsp/x86/sad_ssse3.asm b/media/libvpx/vpx_dsp/x86/sad_ssse3.asm new file mode 100644 index 000000000..49f204fa0 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/sad_ssse3.asm @@ -0,0 +1,370 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm5, XMMWORD PTR [rdi] + lddqu xmm6, XMMWORD PTR [rdi+1] + lddqu xmm7, XMMWORD PTR [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm1, XMMWORD PTR [rdi] + lddqu xmm2, XMMWORD PTR [rdi+1] + lddqu xmm3, XMMWORD PTR [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + lddqu xmm1, XMMWORD PTR [rdi+rdx] + lddqu xmm2, XMMWORD PTR [rdi+rdx+1] + lddqu xmm3, XMMWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X2X3_OFFSET 2 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm7, XMMWORD PTR [rdi+16] + + movdqa xmm5, xmm7 + palignr xmm5, xmm4, %2 + + movdqa xmm6, xmm7 + palignr xmm6, xmm4, (%2+1) + + palignr xmm7, xmm4, (%2+2) + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm3, XMMWORD PTR [rdi+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + movdqa xmm4, XMMWORD PTR [rdi+rdx] + movdqa xmm3, XMMWORD PTR [rdi+rdx+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X16X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +%macro PROCESS_16X8X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +;void int vpx_sad16x16x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) 
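As the prototype comment above indicates, the x3 variants return three SADs per call: the source block is compared against the reference at its nominal position and at horizontal offsets of one and two bytes, which is what the lddqu and palignr loads in the macros above implement. A minimal scalar sketch of the result layout, reusing the hypothetical sad_c helper sketched earlier for the MMX file:

    /* Scalar model of the x3 interface for a 16x16 block: results[i] is the
     * SAD of the source block against ref_ptr + i, for i = 0, 1, 2. */
    static void sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
                             const unsigned char *ref_ptr, int ref_stride,
                             int *results) {
      int i;
      for (i = 0; i < 3; ++i)
        results[i] = (int)sad_c(src_ptr, src_stride, ref_ptr + i, ref_stride,
                                16, 16);
    }

The jump table in the SSSE3 routine below only selects a palignr-based path matching the reference pointer's 16-byte misalignment; every path computes the same three sums.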
+global sym(vpx_sad16x16x3_ssse3) PRIVATE +sym(vpx_sad16x16x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp .vpx_sad16x16x3_ssse3_skiptable +.vpx_sad16x16x3_ssse3_jumptable: + dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump +.vpx_sad16x16x3_ssse3_skiptable: + + call .vpx_sad16x16x3_ssse3_do_jump +.vpx_sad16x16x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 + +.vpx_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.vpx_sad16x16x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void int vpx_sad16x8x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vpx_sad16x8x3_ssse3) PRIVATE +sym(vpx_sad16x8x3_ssse3): + push 
rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp .vpx_sad16x8x3_ssse3_skiptable +.vpx_sad16x8x3_ssse3_jumptable: + dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump +.vpx_sad16x8x3_ssse3_skiptable: + + call .vpx_sad16x8x3_ssse3_do_jump +.vpx_sad16x8x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 + +.vpx_sad16x8x3_ssse3_aligned_by_15: + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.vpx_sad16x8x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/vpx_dsp/x86/variance_avx2.c new file mode 100644 index 000000000..82cef4af0 --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_avx2.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_dsp_rtcd.h" + +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_avx2 var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, + &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + + +unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_get16x16var_avx2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; +} + +unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 11); +} diff --git a/media/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/media/libvpx/vpx_dsp/x86/variance_impl_avx2.c new file mode 100644 index 000000000..0e40959aa --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> // AVX2 + +#include "./vpx_dsp_rtcd.h" + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i, src_2strides, ref_2strides; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing two strides in a 256 bit register reducing the number + // of loop stride by half (comparing to the sse2 code) + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + src = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i const *) (src_ptr))); + src = _mm256_inserti128_si256(src, + _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1); + + ref =_mm256_castsi128_si256( + _mm_loadu_si128((__m128i const *) (ref_ptr))); + ref = _mm256_inserti128_si256(ref, + _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = _mm256_add_epi32(madd_ref_src, + _mm256_add_epi32(madd_low, madd_high)); + + src_ptr+= src_2strides; + ref_ptr+= ref_2strides; + } + + { + __m128i sum_res, madd_res; + __m128i expand_sum_low, expand_sum_high, expand_sum; + __m128i expand_madd_low, expand_madd_high, expand_madd; + __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // extract the low lane and add it to the high lane + sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), + _mm256_extractf128_si256(sum_ref_src, 1)); + + madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), + _mm256_extractf128_si256(madd_ref_src, 1)); + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), + sum_res); + expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), + sum_res); + + // shifting the sign 16 bits right + expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); + + expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm_unpacklo_epi32(madd_res, + _mm256_castsi256_si128(zero_reg)); + expand_madd_high = _mm_unpackhi_epi32(madd_res, + _mm256_castsi256_si128(zero_reg)); + + expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum, + _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum, + _mm256_castsi256_si128(zero_reg)); + + ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); 
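    // At this point the partial sums have been reduced to 128 bits:
    // expand_madd carries the squared-difference totals and ex_expand_sum
    // the signed pixel-difference totals. The shifts and adds below fold
    // each one into a single 32-bit value stored through SSE and Sum;
    // callers such as vpx_variance16x16_avx2 then form the variance as
    // SSE - Sum*Sum/256.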
+ + // shift 8 bytes eight + madd_res = _mm_srli_si128(expand_madd, 8); + sum_res = _mm_srli_si128(ex_expand_sum, 8); + + madd_res = _mm_add_epi32(madd_res, expand_madd); + sum_res = _mm_add_epi32(sum_res, ex_expand_sum); + + *((int*)SSE)= _mm_cvtsi128_si32(madd_res); + + *((int*)Sum)= _mm_cvtsi128_si32(sum_res); + } +} + +void vpx_get32x32var_avx2(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing 32 elements in parallel + for (i = 0; i < 16; i++) { + src = _mm256_loadu_si256((__m256i const *) (src_ptr)); + + ref = _mm256_loadu_si256((__m256i const *) (ref_ptr)); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = _mm256_add_epi32(madd_ref_src, + _mm256_add_epi32(madd_low, madd_high)); + + src_ptr+= source_stride; + ref_ptr+= recon_stride; + } + + { + __m256i expand_sum_low, expand_sum_high, expand_sum; + __m256i expand_madd_low, expand_madd_high, expand_madd; + __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); + expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); + + // shifting the sign 16 bits right + expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); + + expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); + expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); + + expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); + ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); + + ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + + // shift 8 bytes eight + madd_ref_src = _mm256_srli_si256(expand_madd, 8); + sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); + + madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); + sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); + + // extract the low lane and the high lane and add the results + *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); + + *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 
1)); + } +} diff --git a/media/libvpx/vpx_dsp/x86/variance_impl_mmx.asm b/media/libvpx/vpx_dsp/x86/variance_impl_mmx.asm new file mode 100644 index 000000000..a8d7d99db --- /dev/null +++ b/media/libvpx/vpx_dsp/x86/variance_impl_mmx.asm @@ -0,0 +1,424 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) +global sym(vpx_get_mb_ss_mmx) PRIVATE +sym(vpx_get_mb_ss_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + sub rsp, 8 + ; end prolog + + mov rax, arg(0) ;src_ptr + mov rcx, 16 + pxor mm4, mm4 + +.NEXTROW: + movq mm0, [rax] + movq mm1, [rax+8] + movq mm2, [rax+16] + movq mm3, [rax+24] + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + + paddd mm4, mm0 + paddd mm4, mm1 + paddd mm4, mm2 + paddd mm4, mm3 + + add rax, 32 + dec rcx + ja .NEXTROW + movq QWORD PTR [rsp], mm4 + + ;return sum[0]+sum[1]; + movsxd rax, dword ptr [rsp] + movsxd rcx, dword ptr [rsp+4] + add rax, rcx + + + ; begin epilog + add rsp, 8 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_get8x8var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vpx_get8x8var_mmx) PRIVATE +sym(vpx_get8x8var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm1, [rbx] ; Copy eight bytes to mm1 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + + ; Row 2 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy 
eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 3 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 4 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 5 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + ; movq mm4, [rbx + rdx] + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 6 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 7 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; 
square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 8 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Now accumulate the final results. + movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;void +;vpx_get4x4var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vpx_get4x4var_mmx) PRIVATE +sym(vpx_get4x4var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movd mm0, [rax] ; Copy four bytes to mm0 + movd mm1, [rbx] ; Copy four bytes to mm1 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + + ; Row 2 + movd mm0, [rax] ; Copy four bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 3 + movd mm0, [rax] ; Copy four bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher precision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four 
bytes to mm1
+        paddd           mm7, mm0                ; accumulate in mm7
+
+        ; Row 4
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        psubsw          mm0, mm1                ; A-B (low order) to MM0
+
+        paddw           mm5, mm0                ; accumulate differences in mm5
+
+        pmaddwd         mm0, mm0                ; square and accumulate
+        paddd           mm7, mm0                ; accumulate in mm7
+
+
+        ; Now accumulate the final results.
+        movq            QWORD PTR [rsp+8], mm5  ; copy back accumulated results into normal memory
+        movq            QWORD PTR [rsp], mm7    ; copy back accumulated results into normal memory
+        movsx           rdx, WORD PTR [rsp+8]
+        movsx           rcx, WORD PTR [rsp+10]
+        movsx           rbx, WORD PTR [rsp+12]
+        movsx           rax, WORD PTR [rsp+14]
+        add             rdx, rcx
+        add             rbx, rax
+        add             rdx, rbx                ; XSum
+        movsxd          rax, DWORD PTR [rsp]
+        movsxd          rcx, DWORD PTR [rsp+4]
+        add             rax, rcx                ; XXSum
+        mov             rsi, arg(4)             ; SSE
+        mov             rdi, arg(5)             ; Sum
+        mov             dword ptr [rsi], eax
+        mov             dword ptr [rdi], edx
+        xor             rax, rax                ; return 0
+
+
+        ; begin epilog
+        add             rsp, 16
+        pop             rbx
+        pop             rdi
+        pop             rsi
+        UNSHADOW_ARGS
+        pop             rbp
+        ret
diff --git a/media/libvpx/vpx_dsp/x86/variance_mmx.c b/media/libvpx/vpx_dsp/x86/variance_mmx.c
new file mode 100644
index 000000000..99dd741bc
--- /dev/null
+++ b/media/libvpx/vpx_dsp/x86/variance_mmx.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+
+extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              unsigned int *sse, int *sum);
+
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 4));
+}
+
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
+  *sse = var;
+
+  return (var - (((unsigned int)avg * avg) >> 6));
+}
+
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
+                              const unsigned char *b, int b_stride,
+                              unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3, var;
+  int sum0, sum1, sum2, sum3;
+
+  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                    b + 8 * b_stride, b_stride, &sse2, &sum2);
+  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+  var = sse0 + sse1 + sse2 + sse3;
+  *sse = var;
+  return var;
+}
+
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
+                                   const unsigned char *b, int b_stride,
+                                   unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3, var;
+  int sum0, sum1, sum2, sum3, avg;
+
+  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                    b + 8 * b_stride, b_stride, &sse2, &sum2);
+  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+  var = sse0 + sse1 + sse2 + sse3;
+  avg = sum0 + sum1 + sum2 + sum3;
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 8));
+}
+
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
+                                  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 7));
+}
+
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
+                                  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                    b + 8 * b_stride, b_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+
+  return (var - (((unsigned int)avg * avg) >> 7));
+}
diff --git a/media/libvpx/vpx_dsp/x86/variance_sse2.c b/media/libvpx/vpx_dsp/x86/variance_sse2.c
new file mode 100644
index 000000000..6256bc536
--- /dev/null
+++ b/media/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+
+typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
+                                const unsigned char *ref, int ref_stride,
+                                unsigned int *sse, int *sum);
+
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
+
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return _mm_cvtsi128_si32(vsum);
+}
+
+#define READ64(p, stride, i) \
+  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+                    _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
+static void get4x4var_sse2(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
+                           unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+  // sum
+  __m128i vsum = _mm_add_epi16(diff0, diff1);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
+                       _mm_madd_epi16(diff1, diff1));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  *sse = _mm_cvtsi128_si32(vsum);
+}
+
+void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
+                        const uint8_t *ref, int ref_stride,
+                        unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 8; i += 2) {
+    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + i * src_stride)), zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + i * ref_stride)), zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + (i + 1) * src_stride)), zero);
+    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+}
+
+void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    const __m128i s = _mm_loadu_si128((const __m128i *)src);
+    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
+         (int16_t)_mm_extract_epi16(vsum, 1);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+}
+
+
+static void variance_sse2(const unsigned char *src, int src_stride,
+                          const unsigned char *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          getNxMvar_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 4);
+}
+
+unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
+                sse, &sum, get4x4var_sse2, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
+                sse, &sum, get4x4var_sse2, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 6);
+}
+
+unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
+                sse, &sum, vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
+                sse, &sum, vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
+                                    const unsigned char *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse) {
+  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
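
Editor's note: every variance wrapper in the two files above combines its per-block helper the same way. For a w x h block it computes variance = SSE - Sum^2 / (w * h), with the division implemented as a right shift by log2(w * h): >> 4 for 4x4, >> 6 for 8x8, >> 8 for 16x16, and a widening to int64_t once Sum^2 can exceed 32 bits (e.g. >> 12 for 64x64). The MSE entry points are the same wrappers with the mean term simply not subtracted. The sketch below is a plain-C model of that combination for reference only; it is not part of this diff, and the helper names (get_var_c, variance_c) are hypothetical.

#include <stdint.h>

/* Hypothetical scalar model of the SIMD helpers above: accumulate the sum
 * of pixel differences and the sum of squared differences over a w x h
 * block. Not part of the imported sources. */
static void get_var_c(const uint8_t *src, int src_stride,
                      const uint8_t *ref, int ref_stride,
                      int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;
      *sse += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
}

/* Combine as the wrappers do: variance = SSE - Sum^2 / (w * h), where
 * log2_wh = log2(w * h), e.g. 6 for 8x8, 8 for 16x16, 12 for 64x64.
 * The product is widened to 64 bits, matching the int64_t casts used
 * for the 32x32 and larger block sizes. */
static unsigned int variance_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h, int log2_wh,
                               unsigned int *sse) {
  int sum;
  get_var_c(src, src_stride, ref, ref_stride, w, h, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> log2_wh);
}

For example, variance_c(src, src_stride, ref, ref_stride, 16, 16, 8, &sse) mirrors what vpx_variance16x16_sse2 returns, while a call with the subtraction omitted corresponds to the vpx_mse16x16 variants.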