Diffstat (limited to 'media/libvpx/vp8/common/arm/neon')
17 files changed, 6082 insertions, 0 deletions
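The files below add NEON-intrinsics implementations of VP8's bilinear sub-pel prediction, block copy, dequantization and inverse DCT, inverse Walsh-Hadamard transform, and loop-filter routines. The first file, bilinearpredict_neon.c, applies a two-tap bilinear filter in two passes (horizontal, then vertical); each tap pair in bifilter4_coeff sums to 128, and results are rounded back to 8 bits with (x + 64) >> 7. For reference, here is a plain-C sketch of that two-pass filter — illustrative only, with a helper name and signature that are not part of the diff (the NEON code also skips a pass entirely when the corresponding offset is 0):

    /* Illustrative scalar reference for the two-pass bilinear predictor.
     * Not part of the commit; the NEON functions in the diff are the real code. */
    static void bilinear_predict_ref(const unsigned char *src, int src_stride,
                                     int xoffset, int yoffset,
                                     unsigned char *dst, int dst_stride,
                                     int w, int h) {
        unsigned char tmp[17 * 16];  /* (h + 1) x w intermediate, at most 17x16 */
        const int fx0 = 128 - 16 * xoffset, fx1 = 16 * xoffset;
        const int fy0 = 128 - 16 * yoffset, fy1 = 16 * yoffset;
        int r, c;

        /* First pass: horizontal filter, producing h + 1 rows. */
        for (r = 0; r < h + 1; ++r)
            for (c = 0; c < w; ++c)
                tmp[r * w + c] = (unsigned char)((src[r * src_stride + c] * fx0 +
                                                  src[r * src_stride + c + 1] * fx1 +
                                                  64) >> 7);

        /* Second pass: vertical filter between consecutive intermediate rows. */
        for (r = 0; r < h; ++r)
            for (c = 0; c < w; ++c)
                dst[r * dst_stride + c] =
                    (unsigned char)((tmp[r * w + c] * fy0 +
                                     tmp[(r + 1) * w + c] * fy1 + 64) >> 7);
    }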
diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 000000000..9824a3193 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const uint8_t bifilter4_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +void vp8_bilinear_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8; + uint16x8_t q1u16, q2u16; + uint16x8_t q7u16, q8u16, q9u16; + uint64x2_t q4u64, q5u64; + uint64x1_t d12u64; + uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2; + + if (xoffset == 0) { // skip_1stpass_filter + uint32x2_t d28u32 = vdup_n_u32(0); + uint32x2_t d29u32 = vdup_n_u32(0); + uint32x2_t d30u32 = vdup_n_u32(0); + + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0); + src_ptr += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1); + src_ptr += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + } else { + d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d6u8 = vld1_u8(src_ptr); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); + q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); + d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), + vreinterpret_u32_u8(vget_high_u8(q1u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), + vreinterpret_u32_u8(vget_high_u8(q2u8))); + d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), + vreinterpret_u32_u64(vget_high_u64(q4u64))); + d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), + vreinterpret_u32_u64(vget_high_u64(q5u64))); + + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q9u16 = vmull_u8(d6u8, d0u8); + + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8); + q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8); + + d28u8 = vqrshrn_n_u16(q7u16, 7); + d29u8 = vqrshrn_n_u16(q8u16, 7); + d30u8 = 
vqrshrn_n_u16(q9u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d28u8, d0u8); + q2u16 = vmull_u8(d29u8, d0u8); + + d26u8 = vext_u8(d28u8, d29u8, 4); + d27u8 = vext_u8(d29u8, d30u8, 4); + + q1u16 = vmlal_u8(q1u16, d26u8, d1u8); + q2u16 = vmlal_u8(q2u16, d27u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + } + return; +} + +void vp8_bilinear_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8; + uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + d26u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + } else { + d0u8 = 
vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + } + return; +} + +void vp8_bilinear_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), 
vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d29u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); 
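+ // Vertical-only case: each output row is a blend of two consecutive
+ // source rows, (row[n] * coeff[0] + row[n+1] * coeff[1] + 64) >> 7; the
+ // products are widened to 16 bits with vmull_u8/vmlal_u8 and narrowed
+ // back with the rounding shift vqrshrn_n_u16(..., 7).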
src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 
=vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = 
vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); tmpp += 16; + q13u8 = vld1q_u8(tmpp); tmpp += 16; + q14u8 = vld1q_u8(tmpp); tmpp += 16; + q15u8 = vld1q_u8(tmpp); tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 000000000..deced115c --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 
+1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_copy_mem8x4_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 4; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 8; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem16x16_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + int r; + uint8x16_t qtmp; + + for (r = 0; r < 16; r++) { + qtmp = vld1q_u8(src); + vst1q_u8(dst, qtmp); + src += src_stride; + dst += dst_stride; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 000000000..ad5f41d7d --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_dc_only_idct_add_neon( + int16_t input_dc, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c new file mode 100644 index 000000000..58e11922c --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; + +void vp8_dequant_idct_add_neon( + int16_t *input, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int32x2_t d14, d15; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + int16x8_t q1, q2, q3, q4, q5, q6; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x2x2_t d2tmp0, d2tmp1; + int16x4x2_t d2tmp2, d2tmp3; + + d14 = d15 = vdup_n_s32(0); + + // load input + q3 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + input += 8; + q4 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + + // load dq + q5 = vld1q_s16(dq); + dq += 8; + q6 = vld1q_s16(dq); + + // load src from dst + dst0 = dst; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); + dst0 += stride; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); + + q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3), + vreinterpretq_u16_s16(q5))); + q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4), + vreinterpretq_u16_s16(q6))); + + d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); + d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); + + q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + // loop 2 + q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); + d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); + q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); + + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), + vreinterpret_u8_s32(d14))); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), + vreinterpret_u8_s32(d15))); + + d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); + d15 = 
vreinterpret_s32_u8(vqmovun_s16(q2)); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d14, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d14, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 1); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c new file mode 100644 index 000000000..54e709dd3 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) { + int16x8x2_t qQ, qDQC, qDQ; + + qQ = vld2q_s16(d->qcoeff); + qDQC = vld2q_s16(DQC); + + qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]); + qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]); + + vst2q_s16(d->dqcoeff, qDQ); +} diff --git a/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c new file mode 100644 index 000000000..fb327a726 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_blk_neon.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +/* place these declarations here because we don't want to maintain them + * outside of this scope + */ +void idct_dequant_full_2x_neon(short *q, short *dq, + unsigned char *dst, int stride); +void idct_dequant_0_2x_neon(short *q, short dq, + unsigned char *dst, int stride); + + +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dst, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dst, stride); + } + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q+32, dq, dst+8, stride); + else + idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride); + } + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) +{ + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + dstu += 4*stride; + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + + if (((short *)(eobs))[2]) + { + if (((short *)eobs)[2] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } + + q += 32; + dstv += 4*stride; + + if (((short *)(eobs))[3]) + { + if (((short *)eobs)[3] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } +} diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c new file mode 100644 index 000000000..967c32280 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +void idct_dequant_0_2x_neon( + int16_t *q, + int16_t dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32, d4s32; + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c new file mode 100644 index 000000000..a60ed46b7 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +void idct_dequant_full_2x_neon( + int16_t *q, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 = vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); 
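+ // vrshrq_n_s16(x, 3) == (x + 4) >> 3: the rounding shift that scales the
+ // second 1-D inverse transform; the results are then widened, added to
+ // the predictor pixels held in d28-d31 and saturated back to 8 bits.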
+ q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), + vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), + vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), + vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), + vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 000000000..6ea9dd712 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +void vp8_short_inv_walsh4x4_neon( + int16_t *input, + int16_t *mb_dqcoeff) { + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x4_t d4s16, d5s16, d6s16, d7s16; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + int16x8_t qAdd3; + + q0s16 = vld1q_s16(input); + q1s16 = vld1q_s16(input + 8); + + // 1st for loop + d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)), + vreinterpret_s32_s16(vget_low_s16(q1s16))); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)), + vreinterpret_s32_s16(vget_high_s16(q1s16))); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), + vreinterpret_s16_s32(v2tmp3.val[0])); + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), + vreinterpret_s16_s32(v2tmp3.val[1])); + + // 2nd for loop + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + qAdd3 = vdupq_n_s16(3); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + q0s16 = vaddq_s16(q0s16, qAdd3); + q1s16 = vaddq_s16(q1s16, qAdd3); + + q0s16 = vshrq_n_s16(q0s16, 3); + q1s16 = vshrq_n_s16(q1s16, 3); + + // store + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3); + mb_dqcoeff += 16; + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c new file mode 100644 index 000000000..9d6807af7 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/arm.h" + +static INLINE void vp8_loop_filter_neon( + uint8x16_t qblimit, // flimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q10 = vdupq_n_u8(3); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vmovl_u8(vget_low_u8(q10)); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + q0u8 = vdupq_n_u8(0x80); + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8); + return; +} + +void vp8_loop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = 
vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +static INLINE void write_4x8(unsigned char *dst, int pitch, + const uint8x8x4_t result) { +#ifdef VPX_INCOMPATIBLE_GCC + /* + * uint8x8x4_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + 20 21 22 23 | 24 25 26 27 + 30 31 32 33 | 34 35 36 37 + --- + * after vtrn_u16 + 00 01 20 21 | 04 05 24 25 + 02 03 22 23 | 06 07 26 27 + 10 11 30 31 | 14 15 34 35 + 12 13 32 33 | 16 17 36 37 + --- + * after vtrn_u8 + 00 10 20 30 | 04 14 24 34 + 01 11 21 31 | 05 15 25 35 + 02 12 22 32 | 06 16 26 36 + 03 13 23 33 | 07 17 27 37 + */ + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), + vreinterpret_u16_u8(result.val[2])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), + vreinterpret_u16_u8(result.val[3])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); + const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); + 
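+ // After the vtrn steps, each 32-bit lane holds one 4-pixel output row:
+ // lane 0 covers rows 0-3 and lane 1 rows 4-7, which sets the store
+ // order used below.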
const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); + const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); + vst1_lane_u32((uint32_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 1); +#else + vst4_lane_u8(dst, result, 0); + dst += pitch; + vst4_lane_u8(dst, result, 1); + dst += pitch; + vst4_lane_u8(dst, result, 2); + dst += pitch; + vst4_lane_u8(dst, result, 3); + dst += pitch; + vst4_lane_u8(dst, result, 4); + dst += pitch; + vst4_lane_u8(dst, result, 5); + dst += pitch; + vst4_lane_u8(dst, result, 6); + dst += pitch; + vst4_lane_u8(dst, result, 7); +#endif // VPX_INCOMPATIBLE_GCC +} + +void vp8_loop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = 
q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + write_4x8(d, pitch, q4ResultL); + d += pitch * 8; + write_4x8(d, pitch, q4ResultH); +} + +void vp8_loop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, 
qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + write_4x8(ud, pitch, q4ResultL); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + write_4x8(vd, pitch, q4ResultH); +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c new file mode 100644 index 000000000..b25686ffb --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( + unsigned char *s, + int p, + const unsigned char *blimit) { + uint8_t *sp; + uint8x16_t qblimit, q0u8; + uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8; + int16x8_t q2s16, q3s16, q13s16; + int8x8_t d8s8, d9s8; + int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8; + + qblimit = vdupq_n_u8(*blimit); + + sp = s - (p << 1); + q5u8 = vld1q_u8(sp); + sp += p; + q6u8 = vld1q_u8(sp); + sp += p; + q7u8 = vld1q_u8(sp); + sp += p; + q8u8 = vld1q_u8(sp); + + q15u8 = vabdq_u8(q6u8, q7u8); + q14u8 = vabdq_u8(q5u8, q8u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q13s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + q7u8 = veorq_u8(q7u8, q0u8); + q8u8 = veorq_u8(q8u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)), + vget_low_s8(vreinterpretq_s8_u8(q6u8))); + q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)), + vget_high_s8(vreinterpretq_s8_u8(q6u8))); + + q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), + vreinterpretq_s8_u8(q8u8)); + + q2s16 = vmulq_s16(q2s16, q13s16); + q3s16 = vmulq_s16(q3s16, q13s16); + + q10u8 = vdupq_n_u8(3); + q9u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8)); + q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8)); + + d8s8 = vqmovn_s16(q2s16); + d9s8 = vqmovn_s16(q3s16); + q4s8 = vcombine_s8(d8s8, d9s8); + + q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q3s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + vst1q_u8(s, q7u8); + s -= p; + vst1q_u8(s, q6u8); + return; +} + +void vp8_loop_filter_bhs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 
y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbhs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c new file mode 100644 index 000000000..e1c8609e3 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/arm.h" + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE void write_2x4(unsigned char *dst, int pitch, + const uint8x8x2_t result) { + /* + * uint8x8x2_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + --- + * after vtrn_u8 + 00 10 02 12 | 04 14 06 16 + 01 11 03 13 | 05 15 07 17 + */ + const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0], + result.val[1]); + const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]); + const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]); + vst1_lane_u16((uint16_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 3); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 3); +} + +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + write_2x4(dst, pitch, result); + dst += pitch * 8; + write_2x4(dst, pitch, result2); +} +#else +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + vst2_lane_u8(dst, result, 0); + dst += pitch; + vst2_lane_u8(dst, result, 1); + dst += pitch; + vst2_lane_u8(dst, result, 2); + dst += pitch; + vst2_lane_u8(dst, result, 3); + dst += pitch; + vst2_lane_u8(dst, result, 4); + dst += pitch; + vst2_lane_u8(dst, result, 5); + dst += pitch; + vst2_lane_u8(dst, result, 6); + dst += pitch; + vst2_lane_u8(dst, result, 7); + dst += pitch; + + vst2_lane_u8(dst, result2, 0); + dst += pitch; + vst2_lane_u8(dst, result2, 1); + dst += pitch; + vst2_lane_u8(dst, result2, 2); + dst += pitch; + vst2_lane_u8(dst, result2, 3); + dst += pitch; + vst2_lane_u8(dst, result2, 4); + dst += pitch; + vst2_lane_u8(dst, result2, 5); + dst += pitch; + vst2_lane_u8(dst, result2, 6); + dst += pitch; + vst2_lane_u8(dst, result2, 7); +} +#endif // VPX_INCOMPATIBLE_GCC + + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE +uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) { + const uint8x8_t a = vld1_u8(src); + const uint8x8_t b = vld1_u8(src + pitch * 1); + const uint8x8_t c = vld1_u8(src + pitch * 2); + const uint8x8_t d = 
vld1_u8(src + pitch * 3); + const uint8x8_t e = vld1_u8(src + pitch * 4); + const uint8x8_t f = vld1_u8(src + pitch * 5); + const uint8x8_t g = vld1_u8(src + pitch * 6); + const uint8x8_t h = vld1_u8(src + pitch * 7); + const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a), + vreinterpret_u32_u8(e)); + const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b), + vreinterpret_u32_u8(f)); + const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c), + vreinterpret_u32_u8(g)); + const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d), + vreinterpret_u32_u8(h)); + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]), + vreinterpret_u16_u32(r26_u32.val[0])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]), + vreinterpret_u16_u32(r37_u32.val[0])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + /* + * after vtrn_u32 + 00 01 02 03 | 40 41 42 43 + 10 11 12 13 | 50 51 52 53 + 20 21 22 23 | 60 61 62 63 + 30 31 32 33 | 70 71 72 73 + --- + * after vtrn_u16 + 00 01 20 21 | 40 41 60 61 + 02 03 22 23 | 42 43 62 63 + 10 11 30 31 | 50 51 70 71 + 12 13 32 33 | 52 52 72 73 + + 00 01 20 21 | 40 41 60 61 + 10 11 30 31 | 50 51 70 71 + 02 03 22 23 | 42 43 62 63 + 12 13 32 33 | 52 52 72 73 + --- + * after vtrn_u8 + 00 10 20 30 | 40 50 60 70 + 01 11 21 31 | 41 51 61 71 + 02 12 22 32 | 42 52 62 72 + 03 13 23 33 | 43 53 63 73 + */ + x.val[0] = r01_u8.val[0]; + x.val[1] = r01_u8.val[1]; + x.val[2] = r23_u8.val[0]; + x.val[3] = r23_u8.val[1]; + + return x; +} +#else +static INLINE +uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) { + x = vld4_lane_u8(src, x, 0); + src += pitch; + x = vld4_lane_u8(src, x, 1); + src += pitch; + x = vld4_lane_u8(src, x, 2); + src += pitch; + x = vld4_lane_u8(src, x, 3); + src += pitch; + x = vld4_lane_u8(src, x, 4); + src += pitch; + x = vld4_lane_u8(src, x, 5); + src += pitch; + x = vld4_lane_u8(src, x, 6); + src += pitch; + x = vld4_lane_u8(src, x, 7); + return x; +} +#endif // VPX_INCOMPATIBLE_GCC + +static INLINE void vp8_loop_filter_simple_vertical_edge_neon( + unsigned char *s, + int p, + const unsigned char *blimit) { + unsigned char *src1; + uint8x16_t qblimit, q0u8; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; + int16x8_t q2s16, q13s16, q11s16; + int8x8_t d28s8, d29s8; + int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; + uint8x8x4_t d0u8x4; // d6, d7, d8, d9 + uint8x8x4_t d1u8x4; // d10, d11, d12, d13 + uint8x8x2_t d2u8x2; // d12, d13 + uint8x8x2_t d3u8x2; // d14, d15 + + qblimit = vdupq_n_u8(*blimit); + + src1 = s - 2; + d0u8x4 = read_4x8(src1, p, d0u8x4); + src1 += p * 8; + d1u8x4 = read_4x8(src1, p, d1u8x4); + + q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 + q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12 + q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 + q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 + + q15u8 = vabdq_u8(q5u8, q4u8); + q14u8 = vabdq_u8(q3u8, q6u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q11s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q3u8 = veorq_u8(q3u8, q0u8); + q4u8 = veorq_u8(q4u8, q0u8); + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), 
+ vget_low_s8(vreinterpretq_s8_u8(q5u8))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), + vget_high_s8(vreinterpretq_s8_u8(q5u8))); + + q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), + vreinterpretq_s8_u8(q6u8)); + + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q11u8 = vdupq_n_u8(3); + q12u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); + + d28s8 = vqmovn_s16(q2s16); + d29s8 = vqmovn_s16(q13s16); + q14s8 = vcombine_s8(d28s8, d29s8); + + q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q14s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + d2u8x2.val[0] = vget_low_u8(q6u8); // d12 + d2u8x2.val[1] = vget_low_u8(q7u8); // d14 + d3u8x2.val[0] = vget_high_u8(q6u8); // d13 + d3u8x2.val[1] = vget_high_u8(q7u8); // d15 + + src1 = s - 1; + write_2x8(src1, p, d2u8x2, d3u8x2); +} + +void vp8_loop_filter_bvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c new file mode 100644 index 000000000..5351f4be6 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_mbloop_filter_neon( + uint8x16_t qblimit, // mblimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p2 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q4r, // p1 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r, // q1 + uint8x16_t *q9r) { // q1 + uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8; + uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16; + int8x16_t q0s8, q12s8, q14s8, q15s8; + int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q1u8 = vabdq_u8(q9, q8); + q0u8 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q1u8 = vmaxq_u8(q1u8, q0u8); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q12u8 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q1u8); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + q1u8 = vabdq_u8(q5, q8); + q12u8 = vqaddq_u8(q12u8, q12u8); + + // vp8_filter() function + // convert to signed + q0u8 = vdupq_n_u8(0x80); + q9 = veorq_u8(q9, q0u8); + q8 = veorq_u8(q8, q0u8); + q7 = veorq_u8(q7, q0u8); + q6 = veorq_u8(q6, q0u8); + q5 = veorq_u8(q5, q0u8); + q4 = veorq_u8(q4, q0u8); + + q1u8 = vshrq_n_u8(q1u8, 1); + q12u8 = vqaddq_u8(q12u8, q1u8); + + q14u8 = vorrq_u8(q13u8, q14u8); + q12u8 = vcgeq_u8(qblimit, q12u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q11s16 = vdupq_n_s16(3); + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q15u8 = vandq_u8(q15u8, q12u8); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8)); + + q12u8 = vdupq_n_u8(3); + q11u8 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2 = vqmovn_s16(q2s16); + d3 = vqmovn_s16(q13s16); + q1s8 = vcombine_s8(d2, d3); + q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8)); + q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8)); + q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q13s8 = vshrq_n_s8(q13s8, 3); + + q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8); + q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8); + + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63); + d5 = vdup_n_s8(9); + d4 = vdup_n_s8(18); + + q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5); + q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5); + d5 = vdup_n_s8(27); + q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4); + q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4); + q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5); + q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5); + + d0 = 
vqshrn_n_s16(q0s16 , 7); + d1 = vqshrn_n_s16(q11s16, 7); + d24 = vqshrn_n_s16(q12s16, 7); + d25 = vqshrn_n_s16(q13s16, 7); + d28 = vqshrn_n_s16(q14s16, 7); + d29 = vqshrn_n_s16(q15s16, 7); + + q0s8 = vcombine_s8(d0, d1); + q12s8 = vcombine_s8(d24, d25); + q14s8 = vcombine_s8(d28, d29); + + q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8); + q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8); + q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8); + q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8); + q15s8 = vqsubq_s8((q7s8), q14s8); + q14s8 = vqaddq_s8((q6s8), q14s8); + + q1u8 = vdupq_n_u8(0x80); + *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8); + *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8); + *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8); + return; +} + +void vp8_mbloop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + src -= (pitch * 6); + vst1q_u8(src, q4); + src += pitch; + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + src += pitch; + vst1q_u8(src, q9); + return; +} + +void vp8_mbloop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + u -= (pitch * 6); + v -= (pitch * 6); + vst1_u8(u, vget_low_u8(q4)); + u += pitch; + vst1_u8(v, vget_high_u8(q4)); + v += pitch; + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + 
vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(u, vget_low_u8(q8)); + u += pitch; + vst1_u8(v, vget_high_u8(q8)); + v += pitch; + vst1_u8(u, vget_low_u8(q9)); + vst1_u8(v, vget_high_u8(q9)); + return; +} + +void vp8_mbloop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s1, *s2; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s1 = src - 4; + s2 = s1 + 8 * pitch; + d6 = vld1_u8(s1); + s1 += pitch; + d7 = vld1_u8(s2); + s2 += pitch; + d8 = vld1_u8(s1); + s1 += pitch; + d9 = vld1_u8(s2); + s2 += pitch; + d10 = vld1_u8(s1); + s1 += pitch; + d11 = vld1_u8(s2); + s2 += pitch; + d12 = vld1_u8(s1); + s1 += pitch; + d13 = vld1_u8(s2); + s2 += pitch; + d14 = vld1_u8(s1); + s1 += pitch; + d15 = vld1_u8(s2); + s2 += pitch; + d16 = vld1_u8(s1); + s1 += pitch; + d17 = vld1_u8(s2); + s2 += pitch; + d18 = vld1_u8(s1); + s1 += pitch; + d19 = vld1_u8(s2); + s2 += pitch; + d20 = vld1_u8(s1); + d21 = vld1_u8(s2); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + 
vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + s1 -= 7 * pitch; + s2 -= 7 * pitch; + + vst1_u8(s1, vget_low_u8(q3)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q3)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q4)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q4)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q5)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q5)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q6)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q6)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q7)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q7)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q8)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q8)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q9)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q9)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q10)); + vst1_u8(s2, vget_high_u8(q10)); + return; +} + +void vp8_mbloop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + vs = v - 4; + d6 = vld1_u8(us); + us += pitch; + d7 = vld1_u8(vs); + vs += pitch; + d8 = vld1_u8(us); + us += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d10 = vld1_u8(us); + us += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d12 = vld1_u8(us); + us += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d14 = vld1_u8(us); + us += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d16 = vld1_u8(us); + us += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d18 = vld1_u8(us); + us += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d20 = vld1_u8(us); + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + 
vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + ud = u - 4; + vst1_u8(ud, vget_low_u8(q3)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q4)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q5)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q6)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q7)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q8)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q9)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q10)); + + vd = v - 4; + vst1_u8(vd, vget_high_u8(q3)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q4)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q5)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q6)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q7)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q8)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q9)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q10)); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/reconintra_neon.c b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c new file mode 100644 index 000000000..af52cd5ea --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) { + const int mode = x->mode_info_context->mbmi.mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x16_t v_expected_dc = vdupq_n_u8(128); + + if (shift) { + unsigned int average = 0; + int expected_dc; + if (x->up_available) { + const uint8x16_t v_above = vld1q_u8(yabove_row); + const uint16x8_t a = vpaddlq_u8(v_above); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + average = vget_lane_u32(d, 0); + } + if (x->left_available) { + for (i = 0; i < 16; ++i) { + average += yleft[0]; + yleft += left_stride; + } + } + shift += 3; + expected_dc = (average + (1 << (shift - 1))) >> shift; + v_expected_dc = vmovq_n_u8((uint8_t)expected_dc); + } + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_expected_dc); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: + { + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_above); + ypred_ptr += y_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 16; ++i) { + const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]); + yleft += left_stride; + vst1q_u8(ypred_ptr, v_yleft); + ypred_ptr += y_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]); + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]); + const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft); + const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft); + const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo), + vreinterpretq_s16_u16(v_ytop_left)); + const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi), + vreinterpretq_s16_u16(v_ytop_left)); + const uint8x8_t pred_lo = vqmovun_s16(b_lo); + const uint8x8_t pred_hi = vqmovun_s16(b_hi); + + vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi)); + ypred_ptr += y_stride; + yleft += left_stride; + } + } + break; + } +} + +void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) { + const int mode = x->mode_info_context->mbmi.uv_mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x8_t v_expected_udc = vdup_n_u8(128); + uint8x8_t v_expected_vdc = vdup_n_u8(128); + + if (shift) { + unsigned int average_u = 0; + unsigned int average_v = 0; + int expected_udc; + int expected_vdc; + if (x->up_available) { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove)); + const uint32x4_t 
b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0); + average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2); + } + if (x->left_available) { + for (i = 0; i < 8; ++i) { + average_u += uleft[0]; + uleft += left_stride; + average_v += vleft[0]; + vleft += left_stride; + } + } + shift += 2; + expected_udc = (average_u + (1 << (shift - 1))) >> shift; + expected_vdc = (average_v + (1 << (shift - 1))) >> shift; + v_expected_udc = vmov_n_u8((uint8_t)expected_udc); + v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc); + } + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_expected_udc); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_expected_vdc); + vpred_ptr += pred_stride; + } + } + break; + case V_PRED: + { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_uabove); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vabove); + vpred_ptr += pred_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]); + uleft += left_stride; + vleft += left_stride; + vst1_u8(upred_ptr, v_uleft); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vleft); + vpred_ptr += pred_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]); + const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]); + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]); + const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft); + const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft); + const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u), + vreinterpretq_s16_u16(v_utop_left)); + const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v), + vreinterpretq_s16_u16(v_vtop_left)); + const uint8x8_t pred_u = vqmovun_s16(b_u); + const uint8x8_t pred_v = vqmovun_s16(b_v); + + vst1_u8(upred_ptr, pred_u); + vst1_u8(vpred_ptr, pred_v); + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + uleft += left_stride; + vleft += left_stride; + } + } + break; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c new file mode 100644 index 000000000..373afa6ed --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; + +void vp8_short_idct4x4llm_neon( + int16_t *input, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint32x2_t d6u32 = vdup_n_u32(0); + uint8x8_t d1u8; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + uint16x8_t q1u16; + int16x8_t q1s16, q2s16, q3s16, q4s16; + int32x2x2_t v2tmp0, v2tmp1; + int16x4x2_t v2tmp2, v2tmp3; + + d2 = vld1_s16(input); + d3 = vld1_s16(input + 4); + d4 = vld1_s16(input + 8); + d5 = vld1_s16(input + 12); + + // 1st for loop + q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here + q2s16 = vcombine_s16(d3, d5); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q3s16 = vshrq_n_s16(q3s16, 1); + q4s16 = vshrq_n_s16(q4s16, 1); + + q3s16 = vqaddq_s16(q3s16, q2s16); + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + // 2nd for loop + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); + q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q3s16 = vshrq_n_s16(q3s16, 1); + q4s16 = vshrq_n_s16(q4s16, 1); + + q3s16 = vqaddq_s16(q3s16, q2s16); + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); + q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); + + // dc_only_idct_add + for (i = 0; i < 2; i++, q1s16 = q2s16) { + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); + pred_ptr += pred_stride; + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), + vreinterpret_u8_u32(d6u32)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); 
+ dst_ptr += dst_stride; + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c new file mode 100644 index 000000000..4c2efc92b --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -0,0 +1,1754 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "vpx_ports/mem.h" + +static const int8_t vp8_sub_pel_filters[8][8] = { + {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positionyys are */ + {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */ + {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -9, 93, 50, -6, 0, 0, 0}, + {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ + {0, -6, 50, 93, -9, 0, 0, 0}, + {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -1, 12, 123, -6, 0, 0, 0}, +}; + +void vp8_sixtap_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8; + uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8; + uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64; + uint32x2x2_t d0u32x2, d1u32x2; + + if (xoffset == 0) { // secondpass_filter4x4_only + uint32x2_t d27u32 = vdup_n_u32(0); + uint32x2_t d28u32 = vdup_n_u32(0); + uint32x2_t d29u32 = vdup_n_u32(0); + uint32x2_t d30u32 = vdup_n_u32(0); + uint32x2_t d31u32 = vdup_n_u32(0); + + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0); + src += src_pixels_per_line; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1); + src += src_pixels_per_line; + 
d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0); + + d27u8 = vreinterpret_u8_u32(d27u32); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + d31u8 = vreinterpret_u8_u32(d31u32); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); + + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); + + // keep original src data in q4 q6 + q4u64 = vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = 
vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + + d27u8 = vqrshrun_n_s16(q7s16, 7); + d28u8 = vqrshrun_n_s16(q8s16, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + return; + } + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q11u8 = vld1q_u8(src); + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + d31u8 = vext_u8(vget_low_u8(q11u8), 
vget_high_u8(q11u8), 5); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); + q12u16 = vmull_u8(d31u8, d5u8); + + q4u64 = vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + q11u16 = vmull_u8(d31u8, d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + q12s16 = vqaddq_s16(q12s16, q11s16); + + d29u8 = vqrshrun_n_s16(q7s16, 7); + d30u8 = vqrshrun_n_s16(q8s16, 7); + d31u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 4x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); 
+ d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); + + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; +} + +void vp8_sixtap_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; + + if (xoffset == 0) { // secondpass_filter8x4_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = 
vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), 
vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + return; + } + + // First Pass on rest 5-line data + src += src_pixels_per_line; + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + 
q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, 
d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; +} + +void vp8_sixtap_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *tmpp; + unsigned char tmp[64]; + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, 
d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + tmpp = tmp; + for (i = 2; i > 0; i--) { + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + 
q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + dst_ptr += dst_pitch; + } else { + vst1_u8(tmpp, d22u8); + tmpp += 8; + vst1_u8(tmpp, d23u8); + tmpp += 8; + vst1_u8(tmpp, d24u8); + tmpp += 8; + vst1_u8(tmpp, d25u8); + tmpp += 8; + } + } + if (yoffset == 0) + return; + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), 
vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x8 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + tmpp = tmp; + q9u8 = vld1q_u8(tmpp); + tmpp += 16; + q10u8 = vld1q_u8(tmpp); + tmpp += 16; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + q12u8 = vld1q_u8(tmpp); + + d18u8 = vget_low_u8(q9u8); + d19u8 = vget_high_u8(q9u8); + d20u8 = vget_low_u8(q10u8); + d21u8 = vget_high_u8(q10u8); + d22u8 = vget_low_u8(q11u8); + d23u8 = vget_high_u8(q11u8); + d24u8 = vget_low_u8(q12u8); 
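+ // Note (descriptive, not upstream text): for the 8x8 second pass, d18..d25
+ // hold the first eight rows of the 8-wide intermediate reloaded from tmp[],
+ // while d26..d30 still hold the last five first-pass rows; each iteration of
+ // the loop below filters four output rows vertically with the six-tap and
+ // then slides the nine-row window down by four rows.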
+ d25u8 = vget_high_u8(q12u8); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; +} + +void vp8_sixtap_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *src_tmp, *dst, *tmpp; + unsigned char tmp[336]; + int i, j; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; + uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; + uint8x8_t d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint8x16_t q3u8, q4u8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; + uint16x8_t q11u16, q12u16, q13u16, q15u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; + int16x8_t q11s16, q12s16, q13s16, q15s16; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src_tmp = src_ptr - src_pixels_per_line * 2; + for (i = 0; i < 2; i++) { + src = src_tmp + i * 8; + dst = dst_ptr + i * 8; + d18u8 = vld1_u8(src); + src += 
src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) { // firstpass_filter4x4_only + src = src_ptr - 2; + dst = dst_ptr; + for (i = 0; i < 8; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + + q6u16 = vmull_u8(d6u8, d0u8); + q7u16 = vmull_u8(d7u8, d0u8); + q8u16 = vmull_u8(d9u8, d0u8); + q9u16 = vmull_u8(d10u8, d0u8); + + d20u8 = vext_u8(d6u8, d7u8, 1); + d21u8 = vext_u8(d9u8, d10u8, 1); + d22u8 = vext_u8(d7u8, d8u8, 
1); + d23u8 = vext_u8(d10u8, d11u8, 1); + d24u8 = vext_u8(d6u8, d7u8, 4); + d25u8 = vext_u8(d9u8, d10u8, 4); + d26u8 = vext_u8(d7u8, d8u8, 4); + d27u8 = vext_u8(d10u8, d11u8, 4); + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + + q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); + q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); + q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); + q6u16 = vmlal_u8(q6u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + + d20u8 = vext_u8(d7u8, d8u8, 5); + d21u8 = vext_u8(d10u8, d11u8, 5); + d22u8 = vext_u8(d6u8, d7u8, 2); + d23u8 = vext_u8(d9u8, d10u8, 2); + d24u8 = vext_u8(d7u8, d8u8, 2); + d25u8 = vext_u8(d10u8, d11u8, 2); + d26u8 = vext_u8(d6u8, d7u8, 3); + d27u8 = vext_u8(d9u8, d10u8, 3); + d28u8 = vext_u8(d7u8, d8u8, 3); + d29u8 = vext_u8(d10u8, d11u8, 3); + + q7u16 = vmlal_u8(q7u16, d20u8, d5u8); + q9u16 = vmlal_u8(q9u16, d21u8, d5u8); + q6u16 = vmlal_u8(q6u16, d22u8, d2u8); + q8u16 = vmlal_u8(q8u16, d23u8, d2u8); + q7u16 = vmlal_u8(q7u16, d24u8, d2u8); + q9u16 = vmlal_u8(q9u16, d25u8, d2u8); + + q10u16 = vmull_u8(d26u8, d3u8); + q11u16 = vmull_u8(d27u8, d3u8); + q12u16 = vmull_u8(d28u8, d3u8); + q15u16 = vmull_u8(d29u8, d3u8); + + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q15s16 = vreinterpretq_s16_u16(q15u16); + + q6s16 = vqaddq_s16(q6s16, q10s16); + q8s16 = vqaddq_s16(q8s16, q11s16); + q7s16 = vqaddq_s16(q7s16, q12s16); + q9s16 = vqaddq_s16(q9s16, q15s16); + + d6u8 = vqrshrun_n_s16(q6s16, 7); + d7u8 = vqrshrun_n_s16(q7s16, 7); + d8u8 = vqrshrun_n_s16(q8s16, 7); + d9u8 = vqrshrun_n_s16(q9s16, 7); + + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + vst1q_u8(dst, q3u8); + dst += dst_pitch; + vst1q_u8(dst, q4u8); + dst += dst_pitch; + } + return; + } + + src = src_ptr - 2 - src_pixels_per_line * 2; + tmpp = tmp; + for (i = 0; i < 7; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d12u8 = vld1_u8(src); + d13u8 = vld1_u8(src + 8); + d14u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q8u16 = vmull_u8(d6u8, d0u8); + q9u16 = vmull_u8(d7u8, d0u8); + q10u16 = vmull_u8(d9u8, d0u8); + q11u16 = vmull_u8(d10u8, d0u8); + q12u16 = vmull_u8(d12u8, d0u8); + q13u16 = vmull_u8(d13u8, d0u8); + + d28u8 = vext_u8(d6u8, d7u8, 1); + d29u8 = vext_u8(d9u8, d10u8, 1); + d30u8 = vext_u8(d12u8, d13u8, 1); + q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); + d28u8 = vext_u8(d7u8, d8u8, 1); + d29u8 = vext_u8(d10u8, d11u8, 1); + d30u8 = vext_u8(d13u8, d14u8, 1); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); + + d28u8 = vext_u8(d6u8, d7u8, 4); + d29u8 = vext_u8(d9u8, d10u8, 4); + d30u8 = vext_u8(d12u8, d13u8, 4); + q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); + q10u16 = 
vmlsl_u8(q10u16, d29u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); + d28u8 = vext_u8(d7u8, d8u8, 4); + d29u8 = vext_u8(d10u8, d11u8, 4); + d30u8 = vext_u8(d13u8, d14u8, 4); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); + + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + d30u8 = vext_u8(d12u8, d13u8, 5); + q8u16 = vmlal_u8(q8u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q12u16 = vmlal_u8(q12u16, d30u8, d5u8); + d28u8 = vext_u8(d7u8, d8u8, 5); + d29u8 = vext_u8(d10u8, d11u8, 5); + d30u8 = vext_u8(d13u8, d14u8, 5); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q11u16 = vmlal_u8(q11u16, d29u8, d5u8); + q13u16 = vmlal_u8(q13u16, d30u8, d5u8); + + d28u8 = vext_u8(d6u8, d7u8, 2); + d29u8 = vext_u8(d9u8, d10u8, 2); + d30u8 = vext_u8(d12u8, d13u8, 2); + q8u16 = vmlal_u8(q8u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q12u16 = vmlal_u8(q12u16, d30u8, d2u8); + d28u8 = vext_u8(d7u8, d8u8, 2); + d29u8 = vext_u8(d10u8, d11u8, 2); + d30u8 = vext_u8(d13u8, d14u8, 2); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q11u16 = vmlal_u8(q11u16, d29u8, d2u8); + q13u16 = vmlal_u8(q13u16, d30u8, d2u8); + + d28u8 = vext_u8(d6u8, d7u8, 3); + d29u8 = vext_u8(d9u8, d10u8, 3); + d30u8 = vext_u8(d12u8, d13u8, 3); + d15u8 = vext_u8(d7u8, d8u8, 3); + d31u8 = vext_u8(d10u8, d11u8, 3); + d6u8 = vext_u8(d13u8, d14u8, 3); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q12s16 = vqaddq_s16(q12s16, q6s16); + + q6u16 = vmull_u8(d15u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + q3u16 = vmull_u8(d6u8, d3u8); + q3s16 = vreinterpretq_s16_u16(q3u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q13s16 = vreinterpretq_s16_u16(q13u16); + q9s16 = vqaddq_s16(q9s16, q6s16); + q11s16 = vqaddq_s16(q11s16, q7s16); + q13s16 = vqaddq_s16(q13s16, q3s16); + + d6u8 = vqrshrun_n_s16(q8s16, 7); + d7u8 = vqrshrun_n_s16(q9s16, 7); + d8u8 = vqrshrun_n_s16(q10s16, 7); + d9u8 = vqrshrun_n_s16(q11s16, 7); + d10u8 = vqrshrun_n_s16(q12s16, 7); + d11u8 = vqrshrun_n_s16(q13s16, 7); + + vst1_u8(tmpp, d6u8); + tmpp += 8; + vst1_u8(tmpp, d7u8); + tmpp += 8; + vst1_u8(tmpp, d8u8); + tmpp += 8; + vst1_u8(tmpp, d9u8); + tmpp += 8; + vst1_u8(tmpp, d10u8); + tmpp += 8; + vst1_u8(tmpp, d11u8); + tmpp += 8; + } + + // Second pass: 16x16 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + for (i = 0; i < 2; i++) { + dst = dst_ptr + 8 * i; + tmpp = tmp + 8 * i; + d18u8 = vld1_u8(tmpp); + tmpp += 16; + d19u8 = vld1_u8(tmpp); + tmpp += 16; + d20u8 = vld1_u8(tmpp); + tmpp += 16; + d21u8 = vld1_u8(tmpp); + 
tmpp += 16; + d22u8 = vld1_u8(tmpp); + tmpp += 16; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(tmpp); + tmpp += 16; + d24u8 = vld1_u8(tmpp); + tmpp += 16; + d25u8 = vld1_u8(tmpp); + tmpp += 16; + d26u8 = vld1_u8(tmpp); + tmpp += 16; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c new file mode 100644 index 000000000..974d3b653 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c @@ -0,0 +1,1017 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +static const uint16_t bilinear_taps_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +unsigned int vp8_sub_pixel_variance16x16_neon_func( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) { + int i; + DECLARE_ALIGNED(16, unsigned char, tmp[528]); + unsigned char *tmpp; + unsigned char *tmpp2; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; + uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + tmpp2 = tmp + 272; + tmpp = tmp; + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } else if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { 
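+ // Note (descriptive, not upstream text): with yoffset == 0 only the
+ // horizontal bilinear filter is needed; each iteration below filters four
+ // 16-wide rows with the two taps from bilinear_taps_coeff[xoffset] (which
+ // sum to 128, hence vqrshrn_n_u16(., 7)) and writes the results to the
+ // scratch area at tmp + 272 that the variance loop reads back later.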
+ d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp2, q7u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q8u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q9u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q10u8); + tmpp2 += 16; + } + } else { + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = 
vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); + tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); + + tmpp = tmp; + tmpp2 = tmpp + 272; + q11u8 = 
vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); + tmpp += 16; + q13u8 = vld1q_u8(tmpp); + tmpp += 16; + q14u8 = vld1q_u8(tmpp); + tmpp += 16; + q15u8 = vld1q_u8(tmpp); + tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } + + // sub_pixel_variance16x16_neon + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + tmpp = tmp + 272; + for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop + q0u8 = vld1q_u8(tmpp); + tmpp += 16; + q1u8 = vld1q_u8(tmpp); + tmpp += 16; + q2u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + q3u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); + q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = 
vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_h_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8; + uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q0u8 = vld1q_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q2u8 = vld1q_u8(src_ptr); + q3u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + q5u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + q7u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + + q11u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q12u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q13u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q14u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q1u8 = vextq_u8(q0u8, q1u8, 1); + q3u8 = vextq_u8(q2u8, q3u8, 1); + q5u8 = vextq_u8(q4u8, q5u8, 1); + q7u8 = vextq_u8(q6u8, q7u8, 1); + + q0u8 = vrhaddq_u8(q0u8, q1u8); + q1u8 = vrhaddq_u8(q2u8, q3u8); + q2u8 = vrhaddq_u8(q4u8, q5u8); + q3u8 = vrhaddq_u8(q6u8, q7u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d6u8 = vget_low_u8(q3u8); + d7u8 = vget_high_u8(q3u8); + + q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8)); + q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8)); + q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8)); + q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8)); + q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8)); + q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8)); + q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8)); + q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8)); + + d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16)); + d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16)); + q9s32 = vmlal_s16(q9s32, d8s16, d8s16); + q10s32 = vmlal_s16(q10s32, d9s16, d9s16); + d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); + d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16)); + q9s32 = vmlal_s16(q9s32, d10s16, d10s16); + q10s32 = vmlal_s16(q10s32, d11s16, d11s16); + d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); + d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16)); + q9s32 = vmlal_s16(q9s32, d12s16, d12s16); + q10s32 = 
vmlal_s16(q10s32, d13s16, d13s16); + d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16)); + d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16)); + q9s32 = vmlal_s16(q9s32, d14s16, d14s16); + q10s32 = vmlal_s16(q10s32, d15s16, d15s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_v_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q2u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q15u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + + q1u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q5u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q7u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q0u8 = vrhaddq_u8(q0u8, q2u8); + q2u8 = vrhaddq_u8(q2u8, q4u8); + q4u8 = vrhaddq_u8(q4u8, q6u8); + q6u8 = vrhaddq_u8(q6u8, q15u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d8u8 = vget_low_u8(q4u8); + d9u8 = vget_high_u8(q4u8); + 
d12u8 = vget_low_u8(q6u8); + d13u8 = vget_high_u8(q6u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8)); + q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8)); + q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8)); + q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8)); + q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8)); + q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + + q0u8 = q15u8; + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_hv_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16; + int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + 
int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; + uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16; + int32x4_t q13s32, q14s32, q15s32; + int64x2_t q0s64, q1s64, q5s64; + + q13s32 = vdupq_n_s32(0); + q14s32 = vdupq_n_s32(0); + q15s32 = vdupq_n_s32(0); + + q0u8 = vld1q_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q1u8 = vextq_u8(q0u8, q1u8, 1); + q0u8 = vrhaddq_u8(q0u8, q1u8); + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q2u8 = vld1q_u8(src_ptr); + q3u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + q5u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + q7u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q8u8 = vld1q_u8(src_ptr); + q9u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + + q3u8 = vextq_u8(q2u8, q3u8, 1); + q5u8 = vextq_u8(q4u8, q5u8, 1); + q7u8 = vextq_u8(q6u8, q7u8, 1); + q9u8 = vextq_u8(q8u8, q9u8, 1); + + q1u8 = vrhaddq_u8(q2u8, q3u8); + q2u8 = vrhaddq_u8(q4u8, q5u8); + q3u8 = vrhaddq_u8(q6u8, q7u8); + q4u8 = vrhaddq_u8(q8u8, q9u8); + q0u8 = vrhaddq_u8(q0u8, q1u8); + q1u8 = vrhaddq_u8(q1u8, q2u8); + q2u8 = vrhaddq_u8(q2u8, q3u8); + q3u8 = vrhaddq_u8(q3u8, q4u8); + + q5u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q6u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q7u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q8u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d6u8 = vget_low_u8(q3u8); + d7u8 = vget_high_u8(q3u8); + + q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8)); + q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8)); + q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8)); + q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8)); + q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8)); + q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8)); + q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8)); + q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16)); + q14s32 = vmlal_s16(q14s32, d18s16, d18s16); + q15s32 = vmlal_s16(q15s32, d19s16, d19s16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16)); + q14s32 = vmlal_s16(q14s32, d20s16, d20s16); + q15s32 = vmlal_s16(q15s32, d21s16, d21s16); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16)); + q14s32 = vmlal_s16(q14s32, d22s16, d22s16); + q15s32 = vmlal_s16(q15s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16)); + q14s32 = vmlal_s16(q14s32, d24s16, d24s16); + q15s32 = vmlal_s16(q15s32, d25s16, d25s16); + + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16)); + q14s32 = vmlal_s16(q14s32, d0s16, d0s16); + q15s32 = vmlal_s16(q15s32, d1s16, d1s16); + + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = 
vreinterpret_s16_u16(vget_high_u16(q1u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16)); + q14s32 = vmlal_s16(q14s32, d2s16, d2s16); + q15s32 = vmlal_s16(q15s32, d3s16, d3s16); + + d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); + d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16)); + q14s32 = vmlal_s16(q14s32, d10s16, d10s16); + q15s32 = vmlal_s16(q15s32, d11s16, d11s16); + + d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); + d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16)); + q14s32 = vmlal_s16(q14s32, d12s16, d12s16); + q15s32 = vmlal_s16(q15s32, d13s16, d13s16); + + q0u8 = q4u8; + } + + q15s32 = vaddq_s32(q14s32, q15s32); + q0s64 = vpaddlq_s32(q13s32); + q1s64 = vpaddlq_s32(q15s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +#define FILTER_BITS 7 + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (8 * 8)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint16_t *vpx_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t 
src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp8_sub_pixel_variance8x8_neon( + const unsigned char *src, + int src_stride, + int xoffset, + int yoffset, + const unsigned char *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]); + DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); + if (xoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8, + 8, bilinear_taps_coeff[yoffset]); + } else if (yoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, 1, + 9, 8, + bilinear_taps_coeff[xoffset]); + } else { + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, + 9, 8, + bilinear_taps_coeff[xoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, + 8, bilinear_taps_coeff[yoffset]); + } + return variance8x8_neon(temp2, 8, dst, dst_stride, sse); +}
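
For reference, the NEON kernels imported above all follow the same scalar recipe: a two-tap bilinear filter whose coefficients sum to 128, rounded and shifted down by FILTER_BITS (7) via vqrshrn_n_u16/vrshrn_n_u16, followed by variance = SSE - Sum^2 / N (the ">> 8" in the 16x16 paths is the division by 256 pixels; the 8x8 helper divides by 64). The sketch below is an illustrative scalar equivalent only, not part of the diff, and the ref_filter_block2d_bil and ref_variance8x8 names are hypothetical rather than libvpx functions.

#include <stdint.h>

#define REF_FILTER_BITS 7  /* same value as FILTER_BITS in the NEON code */

/* One bilinear pass: out = round((f0*p0 + f1*p1) >> 7), where p1 sits
 * pixel_step bytes after p0 (1 for horizontal, the row stride for vertical). */
static void ref_filter_block2d_bil(const uint8_t *src, uint8_t *dst,
                                   int src_stride, int pixel_step,
                                   int height, int width,
                                   const uint16_t *filter) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int sum = src[j] * filter[0] + src[j + pixel_step] * filter[1];
      dst[j] = (uint8_t)((sum + (1 << (REF_FILTER_BITS - 1))) >> REF_FILTER_BITS);
    }
    src += src_stride;
    dst += width;
  }
}

/* Variance of an 8x8 block: accumulate the per-pixel difference and its
 * square, then subtract the squared-mean correction Sum^2 / 64. */
static unsigned int ref_variance8x8(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int i, j, sum = 0;
  unsigned int sq = 0;
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
      const int d = a[j] - b[j];
      sum += d;
      sq += (unsigned int)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = sq;
  return sq - (unsigned int)(((int64_t)sum * sum) / 64);
}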